[PATCH 2/2] drm/amdkfd: drop struct kfd_cu_info

Wed Sep 27 16:49:06 UTC 2023

On 2023-09-26 12:39, Alex Deucher wrote:
> I think this was an abstraction back from when
> kfd supported both radeon and amdgpu.  Since we just
> support amdgpu now, there is no more need for this and
> we can use the amdgpu structures directly.
>
> This also avoids having the kfd_cu_info structures on
> the stack when inlining which can blow up the stack.
>
> Cc: Arnd Bergmann <arnd at kernel.org>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>

Thanks for this cleanup. The patch is

Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    | 22 ---------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  2 -
>   drivers/gpu/drm/amd/amdkfd/kfd_crat.c         | 28 +++++------
>   drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  | 28 +++++------
>   drivers/gpu/drm/amd/amdkfd/kfd_topology.c     | 49 ++++++++-----------
>   .../gpu/drm/amd/include/kgd_kfd_interface.h   | 14 ------
>   6 files changed, 48 insertions(+), 95 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 38b5457baded..d95fd76102d6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -467,28 +467,6 @@ uint32_t amdgpu_amdkfd_get_max_engine_clock_in_mhz(struct amdgpu_device *adev)
>   		return 100;
>   }
>   
> -void amdgpu_amdkfd_get_cu_info(struct amdgpu_device *adev, struct kfd_cu_info *cu_info)
> -{
> -	struct amdgpu_cu_info acu_info = adev->gfx.cu_info;
> -
> -	memset(cu_info, 0, sizeof(*cu_info));
> -	if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap))
> -		return;
> -
> -	cu_info->cu_active_number = acu_info.number;
> -	cu_info->cu_ao_mask = acu_info.ao_cu_mask;
> -	memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0],
> -	       sizeof(cu_info->cu_bitmap));
> -	cu_info->num_shader_engines = adev->gfx.config.max_shader_engines;
> -	cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se;
> -	cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh;
> -	cu_info->simd_per_cu = acu_info.simd_per_cu;
> -	cu_info->max_waves_per_simd = acu_info.max_waves_per_simd;
> -	cu_info->wave_front_size = acu_info.wave_front_size;
> -	cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu;
> -	cu_info->lds_size = acu_info.lds_size;
> -}
> -
>   int amdgpu_amdkfd_get_dmabuf_info(struct amdgpu_device *adev, int dma_buf_fd,
>   				  struct amdgpu_device **dmabuf_adev,
>   				  uint64_t *bo_size, void *metadata_buffer,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 609a6fefd85f..3ad8dc523b42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -236,8 +236,6 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device *adev,
>   uint64_t amdgpu_amdkfd_get_gpu_clock_counter(struct amdgpu_device *adev);
>   
>   uint32_t amdgpu_amdkfd_get_max_engine_clock_in_mhz(struct amdgpu_device *adev);
> -void amdgpu_amdkfd_get_cu_info(struct amdgpu_device *adev,
> -			       struct kfd_cu_info *cu_info);
>   int amdgpu_amdkfd_get_dmabuf_info(struct amdgpu_device *adev, int dma_buf_fd,
>   				  struct amdgpu_device **dmabuf_adev,
>   				  uint64_t *bo_size, void *metadata_buffer,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> index 9459603804b9..0e792a8496d6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> @@ -2038,11 +2038,12 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
>   				      uint32_t proximity_domain)
>   {
>   	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
> +	struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config;
> +	struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info;
>   	struct crat_subtype_generic *sub_type_hdr;
>   	struct kfd_local_mem_info local_mem_info;
>   	struct kfd_topology_device *peer_dev;
>   	struct crat_subtype_computeunit *cu;
> -	struct kfd_cu_info cu_info;
>   	int avail_size = *size;
>   	uint32_t total_num_of_cu;
>   	uint32_t nid = 0;
> @@ -2086,21 +2087,20 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
>   	cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
>   	cu->proximity_domain = proximity_domain;
>   
> -	amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
> -	cu->num_simd_per_cu = cu_info.simd_per_cu;
> -	cu->num_simd_cores = cu_info.simd_per_cu *
> -			(cu_info.cu_active_number / kdev->kfd->num_nodes);
> -	cu->max_waves_simd = cu_info.max_waves_per_simd;
> +	cu->num_simd_per_cu = cu_info->simd_per_cu;
> +	cu->num_simd_cores = cu_info->simd_per_cu *
> +			(cu_info->number / kdev->kfd->num_nodes);
> +	cu->max_waves_simd = cu_info->max_waves_per_simd;
>   
> -	cu->wave_front_size = cu_info.wave_front_size;
> -	cu->array_count = cu_info.num_shader_arrays_per_engine *
> -		cu_info.num_shader_engines;
> -	total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
> +	cu->wave_front_size = cu_info->wave_front_size;
> +	cu->array_count = gfx_info->max_sh_per_se *
> +		gfx_info->max_shader_engines;
> +	total_num_of_cu = (cu->array_count * gfx_info->max_cu_per_sh);
>   	cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
> -	cu->num_cu_per_array = cu_info.num_cu_per_sh;
> -	cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
> -	cu->num_banks = cu_info.num_shader_engines;
> -	cu->lds_size_in_kb = cu_info.lds_size;
> +	cu->num_cu_per_array = gfx_info->max_cu_per_sh;
> +	cu->max_slots_scatch_cu = cu_info->max_scratch_slots_per_cu;
> +	cu->num_banks = gfx_info->max_shader_engines;
> +	cu->lds_size_in_kb = cu_info->lds_size;
>   
>   	cu->hsa_capability = 0;
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> index 447829c22295..050a6936ff84 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> @@ -99,7 +99,8 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
>   		const uint32_t *cu_mask, uint32_t cu_mask_count,
>   		uint32_t *se_mask, uint32_t inst)
>   {
> -	struct kfd_cu_info cu_info;
> +	struct amdgpu_cu_info *cu_info = &mm->dev->adev->gfx.cu_info;
> +	struct amdgpu_gfx_config *gfx_info = &mm->dev->adev->gfx.config;
>   	uint32_t cu_per_sh[KFD_MAX_NUM_SE][KFD_MAX_NUM_SH_PER_SE] = {0};
>   	bool wgp_mode_req = KFD_GC_VERSION(mm->dev) >= IP_VERSION(10, 0, 0);
>   	uint32_t en_mask = wgp_mode_req ? 0x3 : 0x1;
> @@ -108,9 +109,7 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
>   	int inc = cu_inc * NUM_XCC(mm->dev->xcc_mask);
>   	int xcc_inst = inst + ffs(mm->dev->xcc_mask) - 1;
>   
> -	amdgpu_amdkfd_get_cu_info(mm->dev->adev, &cu_info);
> -
> -	cu_active_per_node = cu_info.cu_active_number / mm->dev->kfd->num_nodes;
> +	cu_active_per_node = cu_info->number / mm->dev->kfd->num_nodes;
>   	if (cu_mask_count > cu_active_per_node)
>   		cu_mask_count = cu_active_per_node;
>   
> @@ -118,13 +117,14 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
>   	 * Returning with no CU's enabled will hang the queue, which should be
>   	 * attention grabbing.
>   	 */
> -	if (cu_info.num_shader_engines > KFD_MAX_NUM_SE) {
> -		pr_err("Exceeded KFD_MAX_NUM_SE, chip reports %d\n", cu_info.num_shader_engines);
> +	if (gfx_info->max_shader_engines > KFD_MAX_NUM_SE) {
> +		pr_err("Exceeded KFD_MAX_NUM_SE, chip reports %d\n",
> +		       gfx_info->max_shader_engines);
>   		return;
>   	}
> -	if (cu_info.num_shader_arrays_per_engine > KFD_MAX_NUM_SH_PER_SE) {
> +	if (gfx_info->max_sh_per_se > KFD_MAX_NUM_SH_PER_SE) {
>   		pr_err("Exceeded KFD_MAX_NUM_SH, chip reports %d\n",
> -			cu_info.num_shader_arrays_per_engine * cu_info.num_shader_engines);
> +			gfx_info->max_sh_per_se * gfx_info->max_shader_engines);
>   		return;
>   	}
>   
> @@ -142,10 +142,10 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
>   	 * See note on Arcturus cu_bitmap layout in gfx_v9_0_get_cu_info.
>   	 * See note on GFX11 cu_bitmap layout in gfx_v11_0_get_cu_info.
>   	 */
> -	for (se = 0; se < cu_info.num_shader_engines; se++)
> -		for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++)
> +	for (se = 0; se < gfx_info->max_shader_engines; se++)
> +		for (sh = 0; sh < gfx_info->max_sh_per_se; sh++)
>   			cu_per_sh[se][sh] = hweight32(
> -				cu_info.cu_bitmap[xcc_inst][se % 4][sh + (se / 4) *
> +				cu_info->bitmap[xcc_inst][se % 4][sh + (se / 4) *
>   				cu_bitmap_sh_mul]);
>   
>   	/* Symmetrically map cu_mask to all SEs & SHs:
> @@ -184,13 +184,13 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
>   	 *
>   	 * First ensure all CUs are disabled, then enable user specified CUs.
>   	 */
> -	for (i = 0; i < cu_info.num_shader_engines; i++)
> +	for (i = 0; i < gfx_info->max_shader_engines; i++)
>   		se_mask[i] = 0;
>   
>   	i = inst;
>   	for (cu = 0; cu < 16; cu += cu_inc) {
> -		for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) {
> -			for (se = 0; se < cu_info.num_shader_engines; se++) {
> +		for (sh = 0; sh < gfx_info->max_sh_per_se; sh++) {
> +			for (se = 0; se < gfx_info->max_shader_engines; se++) {
>   				if (cu_per_sh[se][sh] > cu) {
>   					if (cu_mask[i / 32] & (en_mask << (i % 32)))
>   						se_mask[se] |= en_mask << (cu + sh * 16);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 3f9f882d3f5c..4e530791507e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1533,7 +1533,6 @@ static int kfd_dev_create_p2p_links(void)
>   /* Helper function. See kfd_fill_gpu_cache_info for parameter description */
>   static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext,
>   				struct kfd_gpu_cache_info *pcache_info,
> -				struct kfd_cu_info *cu_info,
>   				int cu_bitmask,
>   				int cache_type, unsigned int cu_processor_id,
>   				int cu_block)
> @@ -1595,7 +1594,8 @@ static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext,
>   /* Helper function. See kfd_fill_gpu_cache_info for parameter description */
>   static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
>   				struct kfd_gpu_cache_info *pcache_info,
> -				struct kfd_cu_info *cu_info,
> +				struct amdgpu_cu_info *cu_info,
> +				struct amdgpu_gfx_config *gfx_info,
>   				int cache_type, unsigned int cu_processor_id,
>   				struct kfd_node *knode)
>   {
> @@ -1606,7 +1606,7 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
>   
>   	start = ffs(knode->xcc_mask) - 1;
>   	end = start + NUM_XCC(knode->xcc_mask);
> -	cu_sibling_map_mask = cu_info->cu_bitmap[start][0][0];
> +	cu_sibling_map_mask = cu_info->bitmap[start][0][0];
>   	cu_sibling_map_mask &=
>   		((1 << pcache_info[cache_type].num_cu_shared) - 1);
>   	first_active_cu = ffs(cu_sibling_map_mask);
> @@ -1642,15 +1642,15 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
>   		k = 0;
>   
>   		for (xcc = start; xcc < end; xcc++) {
> -			for (i = 0; i < cu_info->num_shader_engines; i++) {
> -				for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
> +			for (i = 0; i < gfx_info->max_shader_engines; i++) {
> +				for (j = 0; j < gfx_info->max_sh_per_se; j++) {
>   					pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
>   					pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
>   					pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
>   					pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
>   					k += 4;
>   
> -					cu_sibling_map_mask = cu_info->cu_bitmap[xcc][i % 4][j + i / 4];
> +					cu_sibling_map_mask = cu_info->bitmap[xcc][i % 4][j + i / 4];
>   					cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
>   				}
>   			}
> @@ -1675,16 +1675,14 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
>   	unsigned int cu_processor_id;
>   	int ret;
>   	unsigned int num_cu_shared;
> -	struct kfd_cu_info cu_info;
> -	struct kfd_cu_info *pcu_info;
> +	struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info;
> +	struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config;
>   	int gpu_processor_id;
>   	struct kfd_cache_properties *props_ext;
>   	int num_of_entries = 0;
>   	int num_of_cache_types = 0;
>   	struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
>   
> -	amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
> -	pcu_info = &cu_info;
>   
>   	gpu_processor_id = dev->node_props.simd_id_base;
>   
> @@ -1711,12 +1709,12 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
>   		cu_processor_id = gpu_processor_id;
>   		if (pcache_info[ct].cache_level == 1) {
>   			for (xcc = start; xcc < end; xcc++) {
> -				for (i = 0; i < pcu_info->num_shader_engines; i++) {
> -					for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) {
> -						for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
> +				for (i = 0; i < gfx_info->max_shader_engines; i++) {
> +					for (j = 0; j < gfx_info->max_sh_per_se; j++) {
> +						for (k = 0; k < gfx_info->max_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
>   
> -							ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
> -										pcu_info->cu_bitmap[xcc][i % 4][j + i / 4], ct,
> +							ret = fill_in_l1_pcache(&props_ext, pcache_info,
> +										cu_info->bitmap[xcc][i % 4][j + i / 4], ct,
>   										cu_processor_id, k);
>   
>   							if (ret < 0)
> @@ -1729,9 +1727,9 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
>   
>   							/* Move to next CU block */
>   							num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
> -								pcu_info->num_cu_per_sh) ?
> +								gfx_info->max_cu_per_sh) ?
>   								pcache_info[ct].num_cu_shared :
> -								(pcu_info->num_cu_per_sh - k);
> +								(gfx_info->max_cu_per_sh - k);
>   							cu_processor_id += num_cu_shared;
>   						}
>   					}
> @@ -1739,7 +1737,7 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
>   			}
>   		} else {
>   			ret = fill_in_l2_l3_pcache(&props_ext, pcache_info,
> -					pcu_info, ct, cu_processor_id, kdev);
> +						   cu_info, gfx_info, ct, cu_processor_id, kdev);
>   
>   			if (ret < 0)
>   				break;
> @@ -1918,10 +1916,11 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   {
>   	uint32_t gpu_id;
>   	struct kfd_topology_device *dev;
> -	struct kfd_cu_info *cu_info;
>   	int res = 0;
>   	int i;
>   	const char *asic_name = amdgpu_asic_name[gpu->adev->asic_type];
> +	struct amdgpu_gfx_config *gfx_info = &gpu->adev->gfx.config;
> +	struct amdgpu_cu_info *cu_info = &gpu->adev->gfx.cu_info;
>   
>   	gpu_id = kfd_generate_gpu_id(gpu);
>   	if (gpu->xcp && !gpu->xcp->ddev) {
> @@ -1959,12 +1958,6 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   	/* Fill-in additional information that is not available in CRAT but
>   	 * needed for the topology
>   	 */
> -	cu_info = kzalloc(sizeof(struct kfd_cu_info), GFP_KERNEL);
> -	if (!cu_info)
> -		return -ENOMEM;
> -
> -	amdgpu_amdkfd_get_cu_info(dev->gpu->adev, cu_info);
> -
>   	for (i = 0; i < KFD_TOPOLOGY_PUBLIC_NAME_SIZE-1; i++) {
>   		dev->node_props.name[i] = __tolower(asic_name[i]);
>   		if (asic_name[i] == '\0')
> @@ -1973,7 +1966,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   	dev->node_props.name[i] = '\0';
>   
>   	dev->node_props.simd_arrays_per_engine =
> -		cu_info->num_shader_arrays_per_engine;
> +		gfx_info->max_sh_per_se;
>   
>   	dev->node_props.gfx_target_version =
>   				gpu->kfd->device_info.gfx_target_version;
> @@ -2054,7 +2047,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   	 */
>   	if (dev->gpu->adev->asic_type == CHIP_CARRIZO) {
>   		dev->node_props.simd_count =
> -			cu_info->simd_per_cu * cu_info->cu_active_number;
> +			cu_info->simd_per_cu * cu_info->number;
>   		dev->node_props.max_waves_per_simd = 10;
>   	}
>   
> @@ -2081,8 +2074,6 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   
>   	kfd_notify_gpu_change(gpu_id, 1);
>   
> -	kfree(cu_info);
> -
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> index 255adc30f802..6d094cf3587d 100644
> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> @@ -57,20 +57,6 @@ struct kfd_vm_fault_info {
>   	bool		prot_exec;
>   };
>   
> -struct kfd_cu_info {
> -	uint32_t num_shader_engines;
> -	uint32_t num_shader_arrays_per_engine;
> -	uint32_t num_cu_per_sh;
> -	uint32_t cu_active_number;
> -	uint32_t cu_ao_mask;
> -	uint32_t simd_per_cu;
> -	uint32_t max_waves_per_simd;
> -	uint32_t wave_front_size;
> -	uint32_t max_scratch_slots_per_cu;
> -	uint32_t lds_size;
> -	uint32_t cu_bitmap[AMDGPU_MAX_GC_INSTANCES][4][4];
> -};
> -
>   /* For getting GPU local memory information from KGD */
>   struct kfd_local_mem_info {
>   	uint64_t local_mem_size_private;