[PATCH v2] drm/amdkfd: Account for SH/SE count when setting up cu masks.

Wed Aug 25 00:53:37 UTC 2021

Am 2021-08-23 um 8:36 p.m. schrieb Sean Keely:
> On systems with multiple SH per SE compute_static_thread_mgmt_se#
> is split into independent masks, one for each SH, in the upper and
> lower 16 bits.  We need to detect this and apply cu masking to each
> SH.  The cu mask bits are assigned first to each SE, then to
> alternate SHs, then finally to higher CU id.  This ensures that
> the maximum number of SPIs are engaged as early as possible while
> balancing CU assignment to each SH.
>
> v2: Use max SH/SE rather than max SH in cu_per_sh.
>
> Signed-off-by: Sean Keely <Sean.Keely at amd.com>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 79 ++++++++++++++------
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h |  1 +
>  2 files changed, 59 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> index 88813dad731f..5d7016928d24 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> @@ -98,36 +98,73 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
>  		uint32_t *se_mask)
>  {
>  	struct kfd_cu_info cu_info;
> -	uint32_t cu_per_se[KFD_MAX_NUM_SE] = {0};
> -	int i, se, sh, cu = 0;
> -
> +	uint32_t cu_per_sh[KFD_MAX_NUM_SE][KFD_MAX_NUM_SH_PER_SE] = {0};
> +	int i, se, sh, cu;
>  	amdgpu_amdkfd_get_cu_info(mm->dev->kgd, &cu_info);
>  
>  	if (cu_mask_count > cu_info.cu_active_number)
>  		cu_mask_count = cu_info.cu_active_number;
>  
> +	// Exceeding these bounds corrupts the stack and indicates a coding error.
> +	// Returning with no CU's enabled will hang the queue, which should be
> +	// attention grabbing.
> +	if (cu_info.num_shader_engines > KFD_MAX_NUM_SE) {
> +		pr_err("Exceeded KFD_MAX_NUM_SE, chip reports %d\n", cu_info.num_shader_engines);
> +		return;
> +	}
> +	if (cu_info.num_shader_arrays_per_engine > KFD_MAX_NUM_SH_PER_SE) {
> +		pr_err("Exceeded KFD_MAX_NUM_SH, chip reports %d\n",
> +			cu_info.num_shader_arrays_per_engine * cu_info.num_shader_engines);
> +		return;
> +	}
> +
> +	/* Count active CUs per SH.
> +	 *
> +	 * Some CUs in an SH may be disabled.	HW expects disabled CUs to be
> +	 * represented in the high bits of each SH's enable mask (the upper and lower
> +	 * 16 bits of se_mask) and will take care of the actual distribution of
> +	 * disabled CUs within each SH automatically.
> +	 * Each half of se_mask must be filled compactly, LSB first.
> +	 *
> +	 * See note on Arcturus cu_bitmap layout in gfx_v9_0_get_cu_info.
> +	 */
>  	for (se = 0; se < cu_info.num_shader_engines; se++)
>  		for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++)
> -			cu_per_se[se] += hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]);
> -
> -	/* Symmetrically map cu_mask to all SEs:
> -	 * cu_mask[0] bit0 -> se_mask[0] bit0;
> -	 * cu_mask[0] bit1 -> se_mask[1] bit0;
> -	 * ... (if # SE is 4)
> -	 * cu_mask[0] bit4 -> se_mask[0] bit1;
> +			cu_per_sh[se][sh] = hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]);
> +
> +	/* Symmetrically map cu_mask to all SEs & SHs:
> +	 * se_mask programs up to 2 SH in the upper and lower 16 bits.
> +	 *
> +	 * Examples
> +	 * Assuming 1 SH/SE, 4 SEs:
> +	 * cu_mask[0] bit0 -> se_mask[0] bit0
> +	 * cu_mask[0] bit1 -> se_mask[1] bit0
> +	 * ...
> +	 * cu_mask[0] bit4 -> se_mask[0] bit1
> +	 * ...
> +	 *
> +	 * Assuming 2 SH/SE, 4 SEs
> +	 * cu_mask[0] bit0 -> se_mask[0] bit0 (SE0,SH0,CU0)
> +	 * cu_mask[0] bit1 -> se_mask[1] bit0 (SE1,SH0,CU0)
> +	 * ...
> +	 * cu_mask[0] bit4 -> se_mask[0] bit16 (SE0,SH1,CU0)
> +	 * cu_mask[0] bit5 -> se_mask[1] bit16 (SE1,SH1,CU0)
> +	 * ...
> +	 * cu_mask[0] bit8 -> se_mask[0] bit1 (SE0,SH0,CU1)
>  	 * ...
>  	 */
> -	se = 0;
> -	for (i = 0; i < cu_mask_count; i++) {
> -		if (cu_mask[i / 32] & (1 << (i % 32)))
> -			se_mask[se] |= 1 << cu;
> -
> -		do {
> -			se++;
> -			if (se == cu_info.num_shader_engines) {
> -				se = 0;
> -				cu++;
> +	i = 0;
> +	for (cu = 0; cu < 16; cu++) {
> +		for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) {
> +			for (se = 0; se < cu_info.num_shader_engines; se++) {
> +				if ((cu_mask[i / 32] & (1 << (i % 32))) &&
> +					(cu_per_sh[se][sh] > cu)) {

You need to increment i when cu_mask[i/32] & (1 << (i % 32)) is 0.
Otherwise you'll get stuck on any 0 bit in the CU mask.

Regards,
  Felix

> +					se_mask[se] |= 1 << (cu + sh * 16);
> +					i++;
> +					if (i == cu_mask_count)
> +						return;
> +				}
>  			}
> -		} while (cu >= cu_per_se[se] && cu < 32);
> +		}
>  	}
>  }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> index b5e2ea7550d4..6e6918ccedfd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> @@ -27,6 +27,7 @@
>  #include "kfd_priv.h"
>  
>  #define KFD_MAX_NUM_SE 8
> +#define KFD_MAX_NUM_SH_PER_SE 2
>  
>  /**
>   * struct mqd_manager