[PATCH] drm/amdkfd: Fix CU occupancy calculations for GFX 9.4.3
Joshi, Mukul
Mukul.Joshi at amd.com
Thu Sep 19 21:06:26 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: Kasiviswanathan, Harish <Harish.Kasiviswanathan at amd.com>
> Sent: Thursday, September 19, 2024 3:56 PM
> To: Joshi, Mukul <Mukul.Joshi at amd.com>; amd-gfx at lists.freedesktop.org
> Subject: RE: [PATCH] drm/amdkfd: Fix CU occupancy calculations for GFX 9.4.3
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> This looks like a more robust way to get this information. A few comments
> inline.
>
> - It might be better to break this into two commits: one specific to 9.4.3 and
> the other to switch from the LUT register to the doorbell. The second change
> affects all gfx9 ASICs.
>
To make the changes work for GFX 9.4.3, I would have to add extra changes to access
the LUT register correctly, which would then be removed again in the subsequent patch for all gfx9
ASICs. So I don't think we gain much from breaking it into 2 commits.
What do you think?
> -----Original Message-----
> From: Joshi, Mukul <Mukul.Joshi at amd.com>
> Sent: Thursday, September 19, 2024 1:43 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Kasiviswanathan, Harish <Harish.Kasiviswanathan at amd.com>; Joshi,
> Mukul <Mukul.Joshi at amd.com>
> Subject: [PATCH] drm/amdkfd: Fix CU occupancy calculations for GFX 9.4.3
>
> Make CU occupancy work on GFX 9.4.3 by updating the code to handle
> multiple XCCs correctly. Additionally, update the algorithm to calculate CU
> occupancy by matching the doorbell offset of the queue against the process's
> queues, instead of using the IH_VMID_X_LUT registers, since reading those can
> be racy with the CP updating the VMID-PASID mapping.
>
> Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 92 ++++++++-----------
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  5 +-
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 ++++
> .../drm/amd/amdkfd/kfd_device_queue_manager.h | 3 +
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 18 +++-
> .../gpu/drm/amd/include/kgd_kfd_interface.h | 10 +-
> 6 files changed, 88 insertions(+), 60 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 1254a43ec96b..b8c01257b101 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -950,28 +950,30 @@ static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
> * @inst: xcc's instance number on a multi-XCC setup
> */
> static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
> - int *wave_cnt, int *vmid, uint32_t inst)
> + struct kfd_cu_occupancy *queue_cnt, uint32_t inst)
> {
> int pipe_idx;
> int queue_slot;
> unsigned int reg_val;
> -
> + unsigned int wave_cnt;
> /*
> * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
> * parameters to read out waves in flight. Get VMID if there are
> * non-zero waves in flight.
> */
> - *vmid = 0xFF;
> - *wave_cnt = 0;
> pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
> queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
> - soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst);
> - reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
> -          queue_slot);
> - *wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
> - if (*wave_cnt != 0)
> - *vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) &
> -          CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
> + soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, GET_INST(GC, inst));
> + reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, GET_INST(GC, inst),
> +          mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot);
> + wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
> + if (wave_cnt != 0) {
>
> Why is queue_cnt->wave_cnt accumulating instead of a direct assignment?
The wave_cnt read here is per-SE. So, for a given queue, we need to accumulate the wave_cnt
values read across all SEs to get that queue's total.
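For illustration, a minimal standalone sketch of that per-SE accumulation (not the kernel code;
read_wave_count_for_se() and the NUM_SE/NUM_QUEUES values are made up purely for this example):

#include <stdio.h>

#define NUM_SE     4   /* assumption: 4 shader engines, example only */
#define NUM_QUEUES 8   /* assumption: 8 HQD slots, example only */

/* Hypothetical stand-in for the per-SE SPI_CSQ_WF_ACTIVE_COUNT_0 read. */
static unsigned int read_wave_count_for_se(int se, int queue)
{
	/* Fake data: pretend queue 2 has 3 waves in flight on every SE. */
	(void)se;
	return (queue == 2) ? 3 : 0;
}

int main(void)
{
	unsigned int wave_cnt[NUM_QUEUES] = { 0 };
	int se, q;

	/*
	 * The count register only reports waves for the currently selected
	 * shader engine, so the same queue slot is sampled once per SE and
	 * the readings are summed to get that queue's total wave count.
	 */
	for (se = 0; se < NUM_SE; se++)
		for (q = 0; q < NUM_QUEUES; q++)
			wave_cnt[q] += read_wave_count_for_se(se, q);

	for (q = 0; q < NUM_QUEUES; q++)
		if (wave_cnt[q])
			printf("queue %d: %u waves total\n", q, wave_cnt[q]);

	return 0;	/* prints "queue 2: 12 waves total" */
}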
> There is some inefficiency here. The code iterates over all queues twice:
> once in kfd_get_cu_occupancy() and once more in kgd_gfx_v9_get_cu_occupancy().
> Is that strictly required?
>
Yes. The first pass (in kgd_gfx_v9_get_cu_occupancy()) walks all queues (all HQDs) and accumulates
the wave counts of those with wave_cnt != 0, storing the corresponding doorbell offset of each such queue.
The second pass (in kfd_get_cu_occupancy()) then checks whether each queue with a non-zero wave_cnt
belongs to the process and, if so, adds its wave_cnt to the process's total.
Hope that makes it clear why we need the two passes.
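For reference, a simplified standalone sketch of the two passes (not the actual kernel code;
sample_hqd(), process_owns_doorbell() and the fake data are made-up stand-ins for the HQD
register reads and for kfd_dqm_find_queue_by_doorbell_off() walking qpd->queues_list):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_QUEUES 16   /* assumption: number of HQD slots, example only */

struct cu_occupancy {
	uint32_t wave_cnt;      /* waves in flight on this HQD slot */
	uint32_t doorbell_off;  /* doorbell offset programmed into the HQD */
};

/* Hypothetical stand-in for reading one HQD slot's CP_HQD_* state. */
static void sample_hqd(int slot, struct cu_occupancy *occ)
{
	/* Fake data: slots 1 and 5 are active, owned by doorbells 0x10/0x20. */
	if (slot == 1) { occ->wave_cnt = 8; occ->doorbell_off = 0x10; }
	if (slot == 5) { occ->wave_cnt = 4; occ->doorbell_off = 0x20; }
}

/* Hypothetical stand-in for kfd_dqm_find_queue_by_doorbell_off(). */
static bool process_owns_doorbell(const uint32_t *doorbells, int n, uint32_t off)
{
	int i;

	for (i = 0; i < n; i++)
		if (doorbells[i] == off)
			return true;
	return false;
}

int main(void)
{
	struct cu_occupancy occ[MAX_QUEUES] = { { 0 } };
	uint32_t proc_doorbells[] = { 0x10 };  /* this process owns doorbell 0x10 */
	unsigned int proc_wave_cnt = 0;
	int slot;

	/* Pass 1: walk every HQD slot, record wave count + doorbell offset. */
	for (slot = 0; slot < MAX_QUEUES; slot++)
		sample_hqd(slot, &occ[slot]);

	/* Pass 2: keep only the slots whose doorbell belongs to the process. */
	for (slot = 0; slot < MAX_QUEUES; slot++)
		if (occ[slot].wave_cnt &&
		    process_owns_doorbell(proc_doorbells, 1, occ[slot].doorbell_off))
			proc_wave_cnt += occ[slot].wave_cnt;

	printf("process wave count: %u\n", proc_wave_cnt);  /* prints 8 */
	return 0;
}

Matching on the doorbell offset latched in the HQD is what lets us avoid the IH_VMID_X_LUT read
that could race with the CP updating the VMID-PASID mapping.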
>
> + queue_cnt->wave_cnt += wave_cnt;
> + queue_cnt->doorbell_off =
> +          (RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL) &
> +           CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >>
> +           CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
> + }
> }
>
> /**
> @@ -1021,24 +1023,19 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
> *
> * Reading registers referenced above involves programming GRBM
> appropriately
> */
> -void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
> - int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst)
> +void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
> + struct kfd_cu_occupancy *cu_occupancy,
> + int *max_waves_per_cu, uint32_t inst)
> {
> int qidx;
> - int vmid;
> int se_idx;
> - int sh_idx;
> int se_cnt;
> - int sh_cnt;
> - int wave_cnt;
> int queue_map;
> - int pasid_tmp;
> int max_queue_cnt;
> - int vmid_wave_cnt = 0;
> DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);
>
> lock_spi_csq_mutexes(adev);
> - soc15_grbm_select(adev, 1, 0, 0, 0, inst);
> + soc15_grbm_select(adev, 1, 0, 0, 0, GET_INST(GC, inst));
>
> /*
>  * Iterate through the shader engines and arrays of the device
> @@ -1048,51 +1045,38 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
> AMDGPU_MAX_QUEUES);
> max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
> adev->gfx.mec.num_queue_per_pipe;
> - sh_cnt = adev->gfx.config.max_sh_per_se;
> se_cnt = adev->gfx.config.max_shader_engines;
> for (se_idx = 0; se_idx < se_cnt; se_idx++) {
> - for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {
> + amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst);
> + queue_map = RREG32_SOC15(GC, GET_INST(GC, inst),
> + mmSPI_CSQ_WF_ACTIVE_STATUS);
> +
> + /*
> + * Assumption: queue map encodes following schema: four
> + * pipes per each micro-engine, with each pipe mapping
> + * eight queues. This schema is true for GFX9 devices
> + * and must be verified for newer device families
> + */
> + for (qidx = 0; qidx < max_queue_cnt; qidx++) {
> + /* Skip qeueus that are not associated with
> + * compute functions
> + */
> + if (!test_bit(qidx, cp_queue_bitmap))
> + continue;
>
> - amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst);
> -          queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
> + if (!(queue_map & (1 << qidx)))
> + continue;
>
> - /*
> - * Assumption: queue map encodes following schema: four
> - * pipes per each micro-engine, with each pipe mapping
> - * eight queues. This schema is true for GFX9 devices
> - * and must be verified for newer device families
> - */
> - for (qidx = 0; qidx < max_queue_cnt; qidx++) {
> -
> - /* Skip qeueus that are not associated with
> - * compute functions
> - */
> - if (!test_bit(qidx, cp_queue_bitmap))
> - continue;
> -
> - if (!(queue_map & (1 << qidx)))
> - continue;
> -
> - /* Get number of waves in flight and aggregate them */
> - get_wave_count(adev, qidx, &wave_cnt, &vmid,
> - inst);
> - if (wave_cnt != 0) {
> - pasid_tmp =
> - RREG32(SOC15_REG_OFFSET(OSSSYS, inst,
> - mmIH_VMID_0_LUT) + vmid);
> - if (pasid_tmp == pasid)
> - vmid_wave_cnt += wave_cnt;
> - }
> - }
> + /* Get number of waves in flight and aggregate them */
> + get_wave_count(adev, qidx, &cu_occupancy[qidx],
> + inst);
> }
> }
>
> amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst);
> - soc15_grbm_select(adev, 0, 0, 0, 0, inst);
> + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst));
> unlock_spi_csq_mutexes(adev);
>
> /* Update the output parameters and return */
> - *pasid_wave_cnt = vmid_wave_cnt;
> *max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
> adev->gfx.cu_info.max_waves_per_simd;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index 988c50ac3be0..b6a91a552aa4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -52,8 +52,9 @@ bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
>  		uint8_t vmid, uint16_t *p_pasid);
>  void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
>  		uint32_t vmid, uint64_t page_table_base);
> -void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
> -		int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst);
> +void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
> +		struct kfd_cu_occupancy *cu_occupancy,
> +		int *max_waves_per_cu, uint32_t inst);
>  void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
>  		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
>  		uint32_t inst);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 71b465f8d83e..784d155d8bcc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -3540,6 +3540,26 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
>  	return debug_map_and_unlock(dqm);
>  }
>
> +bool kfd_dqm_find_queue_by_doorbell_off(struct device_queue_manager *dqm,
> +					struct qcm_process_device *qpd,
> +					int doorbell_off)
> +{
> + struct queue *q;
> + bool r = false;
> +
> + dqm_lock(dqm);
> +
> + list_for_each_entry(q, &qpd->queues_list, list) {
> + if (q->properties.doorbell_off == doorbell_off) {
> + r = true;
> + goto out;
> + }
> + }
> +
> +out:
> + dqm_unlock(dqm);
> + return r;
> +}
> #if defined(CONFIG_DEBUG_FS)
>
>  static void seq_reg_dump(struct seq_file *m,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 08b40826ad1e..e5951589e5bd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -324,6 +324,9 @@ void set_queue_snapshot_entry(struct queue *q,
>  int debug_lock_and_unmap(struct device_queue_manager *dqm);
>  int debug_map_and_unlock(struct device_queue_manager *dqm);
>  int debug_refresh_runlist(struct device_queue_manager *dqm);
> +bool kfd_dqm_find_queue_by_doorbell_off(struct device_queue_manager *dqm,
> +					struct qcm_process_device *qpd,
> +					int doorbell_off);
>
>  static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
>  {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index a902950cc060..6720bd30517b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -270,6 +270,10 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
> struct kfd_node *dev = NULL;
> struct kfd_process *proc = NULL;
> struct kfd_process_device *pdd = NULL;
> + int i;
> + struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES];
> +
>
> You can just use sizeof(cu_occupancy), as it is an array.
>
> + memset(cu_occupancy, 0x0, sizeof(struct kfd_cu_occupancy) *
> + AMDGPU_MAX_QUEUES);
>
>
>
> pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy);
> dev = pdd->dev;
> @@ -287,9 +291,19 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
> /* Collect wave count from device if it supports */
> wave_cnt = 0;
> max_waves_per_cu = 0;
> - dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt,
> - &max_waves_per_cu, 0);
>
>
> A comment as to why you are using only the first XCC of the mask would be helpful.
>
Sure will do.
Thanks,
Mukul
> + dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy,
> + &max_waves_per_cu, ffs(dev->xcc_mask) - 1);
> +
> + for (i = 0; i < AMDGPU_MAX_QUEUES; i++) {
> + if (cu_occupancy[i].wave_cnt != 0 &&
> + kfd_dqm_find_queue_by_doorbell_off(dev->dqm, &pdd->qpd,
> + cu_occupancy[i].doorbell_off))
> + wave_cnt += cu_occupancy[i].wave_cnt;
> + }
> +
> + /* Update wave_cnt for the number of XCCs in the partition */
> + wave_cnt *= NUM_XCC(dev->xcc_mask);
> /* Translate wave count to number of compute units */
> cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
>  	return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);
> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> index 7744ca3ef4b1..e3e635a31b8a 100644
> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> @@ -71,6 +71,11 @@ enum kgd_memory_pool {
> KGD_POOL_FRAMEBUFFER = 3,
> };
>
> +struct kfd_cu_occupancy {
> + u32 wave_cnt;
> + u32 doorbell_off;
> +};
> +
> /**
> * enum kfd_sched_policy
> *
> @@ -313,8 +318,9 @@ struct kfd2kgd_calls {
> uint32_t grace_period,
> uint32_t *reg_offset,
> uint32_t *reg_data);
> - void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
> - int *wave_cnt, int *max_waves_per_cu, uint32_t inst);
> + void (*get_cu_occupancy)(struct amdgpu_device *adev,
> + struct kfd_cu_occupancy *cu_occupancy,
> + int *max_waves_per_cu, uint32_t inst);
> void (*program_trap_handler_settings)(struct amdgpu_device *adev,
> uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
> uint32_t inst);
> --
> 2.35.1
>