[PATCHv2 2/4] drm/amdkfd: Update cache info reporting for GFX v9.4.3
Felix Kuehling
felix.kuehling at amd.com
Thu Sep 7 20:51:00 UTC 2023
On 2023-09-06 11:44, Mukul Joshi wrote:
> Update cache info reporting in sysfs to report the correct
> number of CUs and associated cache information based on
> different spatial partitioning modes.
>
> Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
> ---
> v1->v2:
> - Revert the change in kfd_crat.c
> - Add a comment to not change value of CRAT_SIBLINGMAP_SIZE.
>
> drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 4 ++
> drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 82 +++++++++++++----------
> drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 2 +-
> 3 files changed, 51 insertions(+), 37 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> index 387a8ef49385..74c2d7a0d628 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> @@ -79,6 +79,10 @@ struct crat_header {
> #define CRAT_SUBTYPE_IOLINK_AFFINITY 5
> #define CRAT_SUBTYPE_MAX 6
>
> +/*
> + * Do not change the value of CRAT_SIBLINGMAP_SIZE from 32
> + * as it breaks the ABI.
> + */
> #define CRAT_SIBLINGMAP_SIZE 32
>
> /*
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index c54795682dfb..b98cc7930e4c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1596,14 +1596,17 @@ static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext,
> static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
> struct kfd_gpu_cache_info *pcache_info,
> struct kfd_cu_info *cu_info,
> - int cache_type, unsigned int cu_processor_id)
> + int cache_type, unsigned int cu_processor_id,
> + struct kfd_node *knode)
> {
> unsigned int cu_sibling_map_mask;
> int first_active_cu;
> - int i, j, k;
> + int i, j, k, xcc, start, end;
> struct kfd_cache_properties *pcache = NULL;
>
> - cu_sibling_map_mask = cu_info->cu_bitmap[0][0][0];
> + start = ffs(knode->xcc_mask) - 1;
> + end = start + NUM_XCC(knode->xcc_mask);
> + cu_sibling_map_mask = cu_info->cu_bitmap[start][0][0];
> cu_sibling_map_mask &=
> ((1 << pcache_info[cache_type].num_cu_shared) - 1);
> first_active_cu = ffs(cu_sibling_map_mask);
> @@ -1638,16 +1641,18 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
> cu_sibling_map_mask = cu_sibling_map_mask >> (first_active_cu - 1);
> k = 0;
>
> - for (i = 0; i < cu_info->num_shader_engines; i++) {
> - for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
> - pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
> - pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
> - pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
> - pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
> - k += 4;
> -
> - cu_sibling_map_mask = cu_info->cu_bitmap[0][i % 4][j + i / 4];
> - cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
> + for (xcc = start; xcc < end; xcc++) {
> + for (i = 0; i < cu_info->num_shader_engines; i++) {
> + for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
> + pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
> + pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
> + pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
> + pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
> + k += 4;
> +
> + cu_sibling_map_mask = cu_info->cu_bitmap[start][i % 4][j + i / 4];
Shouldn't this use "xcc" as index instead of "start"?
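I.e., I would have expected something along these lines (just a guess, untested):

	cu_sibling_map_mask = cu_info->cu_bitmap[xcc][i % 4][j + i / 4];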
Other than that, this patch is
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
P.S.: This whole function looks suspicious to me in the way it exposes
active/inactive CUs to user mode. I think user mode only wants active CUs
counted in the sibling mask, because it has no other information about
inactive CUs. And reading cu_info into cu_sibling_map_mask but only using
it in the next loop iteration is probably also wrong: it effectively
throws away the cu_info from the last xcc/SE/SA. Or maybe I just don't
understand what this function is trying to do.
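To illustrate what I mean, here is a rough, untested sketch of the inner
loop consuming the bitmap in the same iteration it is read (ignoring the
first_active_cu shift the current code applies to the initial mask, and
assuming the xcc index fix suggested above):

	for (xcc = start; xcc < end; xcc++) {
		for (i = 0; i < cu_info->num_shader_engines; i++) {
			for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
				/* Read the bitmap for this xcc/SE/SA and use it in the
				 * same iteration, rather than carrying it over to the
				 * next one.
				 */
				cu_sibling_map_mask = cu_info->cu_bitmap[xcc][i % 4][j + i / 4];
				cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);

				pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
				pcache->sibling_map[k + 1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
				pcache->sibling_map[k + 2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
				pcache->sibling_map[k + 3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
				k += 4;
			}
		}
	}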
> + cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
> + }
> }
> }
> pcache->sibling_map_size = k;
> @@ -1665,7 +1670,7 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
> static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_node *kdev)
> {
> struct kfd_gpu_cache_info *pcache_info = NULL;
> - int i, j, k;
> + int i, j, k, xcc, start, end;
> int ct = 0;
> unsigned int cu_processor_id;
> int ret;
> @@ -1699,37 +1704,42 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
> * then it will consider only one CU from
> * the shared unit
> */
> + start = ffs(kdev->xcc_mask) - 1;
> + end = start + NUM_XCC(kdev->xcc_mask);
> +
> for (ct = 0; ct < num_of_cache_types; ct++) {
> cu_processor_id = gpu_processor_id;
> if (pcache_info[ct].cache_level == 1) {
> - for (i = 0; i < pcu_info->num_shader_engines; i++) {
> - for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) {
> - for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
> -
> - ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
> - pcu_info->cu_bitmap[0][i % 4][j + i / 4], ct,
> - cu_processor_id, k);
> -
> - if (ret < 0)
> - break;
> -
> - if (!ret) {
> - num_of_entries++;
> - list_add_tail(&props_ext->list, &dev->cache_props);
> + for (xcc = start; xcc < end; xcc++) {
> + for (i = 0; i < pcu_info->num_shader_engines; i++) {
> + for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) {
> + for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
> +
> + ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
> + pcu_info->cu_bitmap[xcc][i % 4][j + i / 4], ct,
> + cu_processor_id, k);
> +
> + if (ret < 0)
> + break;
> +
> + if (!ret) {
> + num_of_entries++;
> + list_add_tail(&props_ext->list, &dev->cache_props);
> + }
> +
> + /* Move to next CU block */
> + num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
> + pcu_info->num_cu_per_sh) ?
> + pcache_info[ct].num_cu_shared :
> + (pcu_info->num_cu_per_sh - k);
> + cu_processor_id += num_cu_shared;
> }
> -
> - /* Move to next CU block */
> - num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
> - pcu_info->num_cu_per_sh) ?
> - pcache_info[ct].num_cu_shared :
> - (pcu_info->num_cu_per_sh - k);
> - cu_processor_id += num_cu_shared;
> }
> }
> }
> } else {
> ret = fill_in_l2_l3_pcache(&props_ext, pcache_info,
> - pcu_info, ct, cu_processor_id);
> + pcu_info, ct, cu_processor_id, kdev);
>
> if (ret < 0)
> break;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> index dea32a9e5506..27386ce9a021 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> @@ -89,7 +89,7 @@ struct kfd_mem_properties {
> struct attribute attr;
> };
>
> -#define CACHE_SIBLINGMAP_SIZE 64
> +#define CACHE_SIBLINGMAP_SIZE 128
>
> struct kfd_cache_properties {
> struct list_head list;