[PATCH v2] drm/amdkfd: Account for SH/SE count when setting up cu masks.
Felix Kuehling
felix.kuehling at amd.com
Wed Aug 25 00:53:37 UTC 2021
Am 2021-08-23 um 8:36 p.m. schrieb Sean Keely:
> On systems with multiple SH per SE compute_static_thread_mgmt_se#
> is split into independent masks, one for each SH, in the upper and
> lower 16 bits. We need to detect this and apply cu masking to each
> SH. The cu mask bits are assigned first to each SE, then to
> alternate SHs, then finally to higher CU id. This ensures that
> the maximum number of SPIs are engaged as early as possible while
> balancing CU assignment to each SH.
>
> v2: Use max SH/SE rather than max SH in cu_per_sh.
>
> Signed-off-by: Sean Keely <Sean.Keely at amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 79 ++++++++++++++------
> drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 1 +
> 2 files changed, 59 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> index 88813dad731f..5d7016928d24 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> @@ -98,36 +98,73 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
> uint32_t *se_mask)
> {
> struct kfd_cu_info cu_info;
> - uint32_t cu_per_se[KFD_MAX_NUM_SE] = {0};
> - int i, se, sh, cu = 0;
> -
> + uint32_t cu_per_sh[KFD_MAX_NUM_SE][KFD_MAX_NUM_SH_PER_SE] = {0};
> + int i, se, sh, cu;
> amdgpu_amdkfd_get_cu_info(mm->dev->kgd, &cu_info);
>
> if (cu_mask_count > cu_info.cu_active_number)
> cu_mask_count = cu_info.cu_active_number;
>
> + // Exceeding these bounds corrupts the stack and indicates a coding error.
> + // Returning with no CU's enabled will hang the queue, which should be
> + // attention grabbing.
> + if (cu_info.num_shader_engines > KFD_MAX_NUM_SE) {
> + pr_err("Exceeded KFD_MAX_NUM_SE, chip reports %d\n", cu_info.num_shader_engines);
> + return;
> + }
> + if (cu_info.num_shader_arrays_per_engine > KFD_MAX_NUM_SH_PER_SE) {
> + pr_err("Exceeded KFD_MAX_NUM_SH, chip reports %d\n",
> + cu_info.num_shader_arrays_per_engine * cu_info.num_shader_engines);
> + return;
> + }
> +
> + /* Count active CUs per SH.
> + *
> + * Some CUs in an SH may be disabled. HW expects disabled CUs to be
> + * represented in the high bits of each SH's enable mask (the upper and lower
> + * 16 bits of se_mask) and will take care of the actual distribution of
> + * disabled CUs within each SH automatically.
> + * Each half of se_mask must be filled compactly, LSB first.
> + *
> + * See note on Arcturus cu_bitmap layout in gfx_v9_0_get_cu_info.
> + */
> for (se = 0; se < cu_info.num_shader_engines; se++)
> for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++)
> - cu_per_se[se] += hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]);
> -
> - /* Symmetrically map cu_mask to all SEs:
> - * cu_mask[0] bit0 -> se_mask[0] bit0;
> - * cu_mask[0] bit1 -> se_mask[1] bit0;
> - * ... (if # SE is 4)
> - * cu_mask[0] bit4 -> se_mask[0] bit1;
> + cu_per_sh[se][sh] = hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]);
> +
> + /* Symmetrically map cu_mask to all SEs & SHs:
> + * se_mask programs up to 2 SH in the upper and lower 16 bits.
> + *
> + * Examples
> + * Assuming 1 SH/SE, 4 SEs:
> + * cu_mask[0] bit0 -> se_mask[0] bit0
> + * cu_mask[0] bit1 -> se_mask[1] bit0
> + * ...
> + * cu_mask[0] bit4 -> se_mask[0] bit1
> + * ...
> + *
> + * Assuming 2 SH/SE, 4 SEs
> + * cu_mask[0] bit0 -> se_mask[0] bit0 (SE0,SH0,CU0)
> + * cu_mask[0] bit1 -> se_mask[1] bit0 (SE1,SH0,CU0)
> + * ...
> + * cu_mask[0] bit4 -> se_mask[0] bit16 (SE0,SH1,CU0)
> + * cu_mask[0] bit5 -> se_mask[1] bit16 (SE1,SH1,CU0)
> + * ...
> + * cu_mask[0] bit8 -> se_mask[0] bit1 (SE0,SH0,CU1)
> * ...
> */
> - se = 0;
> - for (i = 0; i < cu_mask_count; i++) {
> - if (cu_mask[i / 32] & (1 << (i % 32)))
> - se_mask[se] |= 1 << cu;
> -
> - do {
> - se++;
> - if (se == cu_info.num_shader_engines) {
> - se = 0;
> - cu++;
> + i = 0;
> + for (cu = 0; cu < 16; cu++) {
> + for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) {
> + for (se = 0; se < cu_info.num_shader_engines; se++) {
> + if ((cu_mask[i / 32] & (1 << (i % 32))) &&
> + (cu_per_sh[se][sh] > cu)) {
You need to increment i when cu_mask[i/32] & (1 << (i % 32)) is 0.
Otherwise you'll get stuck on any 0 bit in the CU mask.
Regards,
Felix
> + se_mask[se] |= 1 << (cu + sh * 16);
> + i++;
> + if (i == cu_mask_count)
> + return;
> + }
> }
> - } while (cu >= cu_per_se[se] && cu < 32);
> + }
> }
> }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> index b5e2ea7550d4..6e6918ccedfd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> @@ -27,6 +27,7 @@
> #include "kfd_priv.h"
>
> #define KFD_MAX_NUM_SE 8
> +#define KFD_MAX_NUM_SH_PER_SE 2
>
> /**
> * struct mqd_manager
More information about the amd-gfx
mailing list