[PATCH v3] drm/amdkfd: Fix the warning of array-index-out-of-bounds

Ma, Jun majun at amd.com
Tue Nov 1 06:26:31 UTC 2022



On 10/29/2022 4:17 AM, Felix Kuehling wrote:
> On 2022-10-27 04:14, Ma Jun wrote:
>> For some GPUs with more CUs, the original sibling_map[32]
>>
>> in struct crat_subtype_cache is not enough
>>
>> to save the cache information when create the VCRAT table,
>>
>> so skip filling the struct crat_subtype_cache info instead
>>
>> fill struct kfd_cache_properties directly to fix this problem.
>>
>> v3:
>> - Drop processor id calc function
>> v2:
>> - Remove unnecessary sys interface "cache_ext"
>>
>> Signed-off-by: Ma Jun <Jun.Ma2 at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_crat.c     | 307 +++-------------------
>>   drivers/gpu/drm/amd/amdkfd/kfd_crat.h     |  12 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 238 ++++++++++++++++-
>>   drivers/gpu/drm/amd/amdkfd/kfd_topology.h |   5 +-
>>   4 files changed, 278 insertions(+), 284 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> index d25ac9cbe5b2..8b7e34b45740 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> [snip]
>> +int get_gpu_cache_info(struct kfd_dev *kdev, struct kfd_gpu_cache_info **pcache_info)
>>   {
>> -	struct kfd_gpu_cache_info *pcache_info;
>>   	struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
>>   	int num_of_cache_types = 0;
>> -	int i, j, k;
>> -	int ct = 0;
>> -	int mem_available = available_size;
>> -	unsigned int cu_processor_id;
>> -	int ret;
>> -	unsigned int num_cu_shared;
>>   
>>   	switch (kdev->adev->asic_type) {
> [snip]
>>
>>   	default:
>>   		switch (KFD_GC_VERSION(kdev)) {
> [snip]
>>   		case IP_VERSION(11, 0, 0):
>>   		case IP_VERSION(11, 0, 1):
>>   		case IP_VERSION(11, 0, 2):
>>   		case IP_VERSION(11, 0, 3):
>> -			pcache_info = cache_info;
>> +			*pcache_info = cache_info;
> 
> This won't work. cache_info is a local variable. It will be out of scope 
> as soon as this function returns. You'll need to allocate this in some 
> data structure that will persist after the function returns. Maybe 
> expect the caller to pass in a pointer to an array in their own stack frame.

Yes, this is my mistake. Will fix in next version

> 
> 
>>   			num_of_cache_types =
>> -				kfd_fill_gpu_cache_info_from_gfx_config(kdev, pcache_info);
>> +				kfd_fill_gpu_cache_info_from_gfx_config(kdev, *pcache_info);
> [snip]
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
>> index e0680d265a66..dc231e248258 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> 
> [snip]
> 
>  > int kfd_topology_add_device(struct kfd_dev *gpu)
>>   {
>>   	uint32_t gpu_id;
>> @@ -1759,6 +1970,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>>   			topology_crat_proximity_domain--;
>>   			return res;
>>   		}
>> +
>>   		res = kfd_parse_crat_table(crat_image,
>>   					   &temp_topology_device_list,
>>   					   proximity_domain);
>> @@ -1771,23 +1983,31 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>>   
>>   		kfd_topology_update_device_list(&temp_topology_device_list,
>>   			&topology_device_list);
>> +		up_write(&topology_lock);
> 
> I'm not sure if dropping and re-taking the topology lock here could lead 
> to race conditions. But this could be avoided, if you moved the 
> responsibility for topology locking out of kfd_assign_gpu and into the 
> caller (kfd_topology_add_device).

Thanks for review.Will fix in next version.

Regards,
Ma Jun
> 
> Regards,
>    Felix
> 
> 
>> +
>> +		dev = kfd_assign_gpu(gpu);
>> +		if (WARN_ON(!dev)) {
>> +			res = -ENODEV;
>> +			goto err;
>> +		}
>> +
>> +		down_write(&topology_lock);
>> +
>> +		/* Fill the cache affinity information here for the GPUs
>> +		 * using VCRAT
>> +		 */
>> +		kfd_fill_cache_non_crat_info(dev, gpu);
>>   
>>   		/* Update the SYSFS tree, since we added another topology
>>   		 * device
>>   		 */
>>   		res = kfd_topology_update_sysfs();
>>   		up_write(&topology_lock);
>> -
>>   		if (!res)
>>   			sys_props.generation_count++;
>>   		else
>>   			pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
>>   						gpu_id, res);
>> -		dev = kfd_assign_gpu(gpu);
>> -		if (WARN_ON(!dev)) {
>> -			res = -ENODEV;
>> -			goto err;
>> -		}
>>   	}
>>   
>>   	dev->gpu_id = gpu_id;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
>> index dc4e239c8f8f..3e8ac87f0ac9 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
>> @@ -87,6 +87,8 @@ struct kfd_mem_properties {
>>   	struct attribute	attr_used;
>>   };
>>   
>> +#define CACHE_SIBLINGMAP_SIZE 64
>> +
>>   struct kfd_cache_properties {
>>   	struct list_head	list;
>>   	uint32_t		processor_id_low;
>> @@ -97,10 +99,11 @@ struct kfd_cache_properties {
>>   	uint32_t		cache_assoc;
>>   	uint32_t		cache_latency;
>>   	uint32_t		cache_type;
>> -	uint8_t			sibling_map[CRAT_SIBLINGMAP_SIZE];
>> +	uint8_t			sibling_map[CACHE_SIBLINGMAP_SIZE];
>>   	struct kfd_dev		*gpu;
>>   	struct kobject		*kobj;
>>   	struct attribute	attr;
>> +	uint32_t		sibling_map_size;
>>   };
>>   
>>   struct kfd_iolink_properties {


More information about the amd-gfx mailing list