[PATCHv2 2/4] drm/amdkfd: Update cache info reporting for GFX v9.4.3
Felix Kuehling
felix.kuehling at amd.com
Thu Sep 7 20:51:00 UTC 2023
On 2023-09-06 11:44, Mukul Joshi wrote:
> Update cache info reporting in sysfs to report the correct
> number of CUs and associated cache information based on
> different spatial partitioning modes.
>
> Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
> ---
> v1->v2:
> - Revert the change in kfd_crat.c
> - Add a comment to not change value of CRAT_SIBLINGMAP_SIZE.
>
> drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 4 ++
> drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 82 +++++++++++++----------
> drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 2 +-
> 3 files changed, 51 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> index 387a8ef49385..74c2d7a0d628 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> @@ -79,6 +79,10 @@ struct crat_header {
> #define CRAT_SUBTYPE_IOLINK_AFFINITY 5
> #define CRAT_SUBTYPE_MAX 6
>
> +/*
> + * Do not change the value of CRAT_SIBLINGMAP_SIZE from 32
> + * as it breaks the ABI.
> + */
> #define CRAT_SIBLINGMAP_SIZE 32
>
> /*
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index c54795682dfb..b98cc7930e4c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1596,14 +1596,17 @@ static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext,
> static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
> struct kfd_gpu_cache_info *pcache_info,
> struct kfd_cu_info *cu_info,
> - int cache_type, unsigned int cu_processor_id)
> + int cache_type, unsigned int cu_processor_id,
> + struct kfd_node *knode)
> {
> unsigned int cu_sibling_map_mask;
> int first_active_cu;
> - int i, j, k;
> + int i, j, k, xcc, start, end;
> struct kfd_cache_properties *pcache = NULL;
>
> - cu_sibling_map_mask = cu_info->cu_bitmap[0][0][0];
> + start = ffs(knode->xcc_mask) - 1;
> + end = start + NUM_XCC(knode->xcc_mask);
> + cu_sibling_map_mask = cu_info->cu_bitmap[start][0][0];
> cu_sibling_map_mask &=
> ((1 << pcache_info[cache_type].num_cu_shared) - 1);
> first_active_cu = ffs(cu_sibling_map_mask);
> @@ -1638,16 +1641,18 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
> cu_sibling_map_mask = cu_sibling_map_mask >> (first_active_cu - 1);
> k = 0;
>
> - for (i = 0; i < cu_info->num_shader_engines; i++) {
> - for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
> - pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
> - pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
> - pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
> - pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
> - k += 4;
> -
> - cu_sibling_map_mask = cu_info->cu_bitmap[0][i % 4][j + i / 4];
> - cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
> + for (xcc = start; xcc < end; xcc++) {
> + for (i = 0; i < cu_info->num_shader_engines; i++) {
> + for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
> + pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
> + pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
> + pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
> + pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
> + k += 4;
> +
> + cu_sibling_map_mask = cu_info->cu_bitmap[start][i % 4][j + i / 4];
Shouldn't this use "xcc" as index instead of "start"?
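I.e., I would have expected something along these lines (just a guess, untested):

	cu_sibling_map_mask = cu_info->cu_bitmap[xcc][i % 4][j + i / 4];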
Other than that, this patch is
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
P.S.: This whole function looks suspicious to me in the way it exposes
active/inactive CUs to user mode. I think user mode only wants active CUs
counted in the sibling mask, because it has no other information about
inactive CUs. And reading cu_info into cu_sibling_map_mask but only using
it in the next loop iteration is probably also wrong: it effectively
throws away the cu_info from the last xcc/SE/SA. Or maybe I just don't
understand what this function is trying to do.
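To illustrate what I mean, here is a rough, untested sketch of the inner
loop consuming the bitmap in the same iteration it is read (ignoring the
first_active_cu shift the current code applies to the initial mask, and
assuming the xcc index fix suggested above):

	for (xcc = start; xcc < end; xcc++) {
		for (i = 0; i < cu_info->num_shader_engines; i++) {
			for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
				/* Read the bitmap for this xcc/SE/SA and use it in the
				 * same iteration, rather than carrying it over to the
				 * next one.
				 */
				cu_sibling_map_mask = cu_info->cu_bitmap[xcc][i % 4][j + i / 4];
				cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);

				pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
				pcache->sibling_map[k + 1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
				pcache->sibling_map[k + 2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
				pcache->sibling_map[k + 3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
				k += 4;
			}
		}
	}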
> + cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
> + }
> }
> }
> pcache->sibling_map_size = k;
> @@ -1665,7 +1670,7 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
> static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_node *kdev)
> {
> struct kfd_gpu_cache_info *pcache_info = NULL;
> - int i, j, k;
> + int i, j, k, xcc, start, end;
> int ct = 0;
> unsigned int cu_processor_id;
> int ret;
> @@ -1699,37 +1704,42 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
> * then it will consider only one CU from
> * the shared unit
> */
> + start = ffs(kdev->xcc_mask) - 1;
> + end = start + NUM_XCC(kdev->xcc_mask);
> +
> for (ct = 0; ct < num_of_cache_types; ct++) {
> cu_processor_id = gpu_processor_id;
> if (pcache_info[ct].cache_level == 1) {
> - for (i = 0; i < pcu_info->num_shader_engines; i++) {
> - for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) {
> - for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
> -
> - ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
> - pcu_info->cu_bitmap[0][i % 4][j + i / 4], ct,
> - cu_processor_id, k);
> -
> - if (ret < 0)
> - break;
> -
> - if (!ret) {
> - num_of_entries++;
> - list_add_tail(&props_ext->list, &dev->cache_props);
> + for (xcc = start; xcc < end; xcc++) {
> + for (i = 0; i < pcu_info->num_shader_engines; i++) {
> + for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) {
> + for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
> +
> + ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
> + pcu_info->cu_bitmap[xcc][i % 4][j + i / 4], ct,
> + cu_processor_id, k);
> +
> + if (ret < 0)
> + break;
> +
> + if (!ret) {
> + num_of_entries++;
> + list_add_tail(&props_ext->list, &dev->cache_props);
> + }
> +
> + /* Move to next CU block */
> + num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
> + pcu_info->num_cu_per_sh) ?
> + pcache_info[ct].num_cu_shared :
> + (pcu_info->num_cu_per_sh - k);
> + cu_processor_id += num_cu_shared;
> }
> -
> - /* Move to next CU block */
> - num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
> - pcu_info->num_cu_per_sh) ?
> - pcache_info[ct].num_cu_shared :
> - (pcu_info->num_cu_per_sh - k);
> - cu_processor_id += num_cu_shared;
> }
> }
> }
> } else {
> ret = fill_in_l2_l3_pcache(&props_ext, pcache_info,
> - pcu_info, ct, cu_processor_id);
> + pcu_info, ct, cu_processor_id, kdev);
>
> if (ret < 0)
> break;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> index dea32a9e5506..27386ce9a021 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> @@ -89,7 +89,7 @@ struct kfd_mem_properties {
> struct attribute attr;
> };
>
> -#define CACHE_SIBLINGMAP_SIZE 64
> +#define CACHE_SIBLINGMAP_SIZE 128
>
> struct kfd_cache_properties {
> struct list_head list;