[PATCH] drm/amdkfd: Fix CU occupancy calculations for GFX 9.4.3
Joshi, Mukul
Mukul.Joshi at amd.com
Thu Sep 19 21:06:26 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: Kasiviswanathan, Harish <Harish.Kasiviswanathan at amd.com>
> Sent: Thursday, September 19, 2024 3:56 PM
> To: Joshi, Mukul <Mukul.Joshi at amd.com>; amd-gfx at lists.freedesktop.org
> Subject: RE: [PATCH] drm/amdkfd: Fix CU occupancy calculations for GFX 9.4.3
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> This looks like a more robust way to get this information. A few comments
> inline.
>
> - It might be better to break this into two commits: one specific to 9.4.3 and
> the other to switch from the LUT register to the doorbell. The second change
> affects all gfx9 ASICs.
>
To make the changes work for GFX 9.4.3, I would have to add extra changes to access
the LUT register correctly, which would then be removed again in the subsequent patch for all gfx9
ASICs. So I don't think we gain much from breaking it into 2 commits.
What do you think?
> -----Original Message-----
> From: Joshi, Mukul <Mukul.Joshi at amd.com>
> Sent: Thursday, September 19, 2024 1:43 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Kasiviswanathan, Harish <Harish.Kasiviswanathan at amd.com>; Joshi,
> Mukul <Mukul.Joshi at amd.com>
> Subject: [PATCH] drm/amdkfd: Fix CU occupancy calculations for GFX 9.4.3
>
> Make CU occupancy work on GFX 9.4.3 by updating the code to handle
> multiple XCCs correctly. Additionally, update the algorithm to calculate CU
> occupancy by matching the doorbell offset of the queue against the process's
> queues, instead of using the IH_VMID_X_LUT registers, since reading those can
> be racy with the CP updating the VMID-PASID mapping.
>
> Signed-off-by: Mukul Joshi <mukul.joshi at amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 92 ++++++++-----------
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  5 +-
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 ++++
> .../drm/amd/amdkfd/kfd_device_queue_manager.h | 3 +
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 18 +++-
> .../gpu/drm/amd/include/kgd_kfd_interface.h | 10 +-
> 6 files changed, 88 insertions(+), 60 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 1254a43ec96b..b8c01257b101 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -950,28 +950,30 @@ static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
> * @inst: xcc's instance number on a multi-XCC setup
> */
> static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
> - int *wave_cnt, int *vmid, uint32_t inst)
> + struct kfd_cu_occupancy *queue_cnt, uint32_t inst)
> {
> int pipe_idx;
> int queue_slot;
> unsigned int reg_val;
> -
> + unsigned int wave_cnt;
> /*
> * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
> * parameters to read out waves in flight. Get VMID if there are
> * non-zero waves in flight.
> */
> - *vmid = 0xFF;
> - *wave_cnt = 0;
> pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
> queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
> - soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst);
> - reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
> -          queue_slot);
> - *wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
> - if (*wave_cnt != 0)
> - *vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) &
> -          CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
> + soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, GET_INST(GC, inst));
> + reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, GET_INST(GC, inst),
> +          mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot);
> + wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
> + if (wave_cnt != 0) {
>
> Why is queue_cnt->wave_cnt accumulating instead of a direct assignment?
The wave_cnt read here is per-SE. So, for a given queue, we need to accumulate the wave_cnt
values read across all SEs to get that queue's total.
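For illustration, a minimal standalone sketch of that per-SE accumulation (not the kernel code;
read_wave_count_for_se() and the NUM_SE/NUM_QUEUES values are made up purely for this example):

#include <stdio.h>

#define NUM_SE     4   /* assumption: 4 shader engines, example only */
#define NUM_QUEUES 8   /* assumption: 8 HQD slots, example only */

/* Hypothetical stand-in for the per-SE SPI_CSQ_WF_ACTIVE_COUNT_0 read. */
static unsigned int read_wave_count_for_se(int se, int queue)
{
	/* Fake data: pretend queue 2 has 3 waves in flight on every SE. */
	(void)se;
	return (queue == 2) ? 3 : 0;
}

int main(void)
{
	unsigned int wave_cnt[NUM_QUEUES] = { 0 };
	int se, q;

	/*
	 * The count register only reports waves for the currently selected
	 * shader engine, so the same queue slot is sampled once per SE and
	 * the readings are summed to get that queue's total wave count.
	 */
	for (se = 0; se < NUM_SE; se++)
		for (q = 0; q < NUM_QUEUES; q++)
			wave_cnt[q] += read_wave_count_for_se(se, q);

	for (q = 0; q < NUM_QUEUES; q++)
		if (wave_cnt[q])
			printf("queue %d: %u waves total\n", q, wave_cnt[q]);

	return 0;	/* prints "queue 2: 12 waves total" */
}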
> There is some inefficiency here. The code iterates over all queues twice:
> once in kfd_get_cu_occupancy() and once more in kgd_gfx_v9_get_cu_occupancy().
> Is that strictly required?
>
Yes. The first pass (in kgd_gfx_v9_get_cu_occupancy()) walks all queues (all HQDs) and accumulates
the wave counts of those with wave_cnt != 0, storing the corresponding doorbell offset of each such queue.
The second pass (in kfd_get_cu_occupancy()) then checks whether each queue with a non-zero wave_cnt
belongs to the process and, if so, adds its wave_cnt to the process's total.
Hope that makes it clear why we need the two passes.
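For reference, a simplified standalone sketch of the two passes (not the actual kernel code;
sample_hqd(), process_owns_doorbell() and the fake data are made-up stand-ins for the HQD
register reads and for kfd_dqm_find_queue_by_doorbell_off() walking qpd->queues_list):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_QUEUES 16   /* assumption: number of HQD slots, example only */

struct cu_occupancy {
	uint32_t wave_cnt;      /* waves in flight on this HQD slot */
	uint32_t doorbell_off;  /* doorbell offset programmed into the HQD */
};

/* Hypothetical stand-in for reading one HQD slot's CP_HQD_* state. */
static void sample_hqd(int slot, struct cu_occupancy *occ)
{
	/* Fake data: slots 1 and 5 are active, owned by doorbells 0x10/0x20. */
	if (slot == 1) { occ->wave_cnt = 8; occ->doorbell_off = 0x10; }
	if (slot == 5) { occ->wave_cnt = 4; occ->doorbell_off = 0x20; }
}

/* Hypothetical stand-in for kfd_dqm_find_queue_by_doorbell_off(). */
static bool process_owns_doorbell(const uint32_t *doorbells, int n, uint32_t off)
{
	int i;

	for (i = 0; i < n; i++)
		if (doorbells[i] == off)
			return true;
	return false;
}

int main(void)
{
	struct cu_occupancy occ[MAX_QUEUES] = { { 0 } };
	uint32_t proc_doorbells[] = { 0x10 };  /* this process owns doorbell 0x10 */
	unsigned int proc_wave_cnt = 0;
	int slot;

	/* Pass 1: walk every HQD slot, record wave count + doorbell offset. */
	for (slot = 0; slot < MAX_QUEUES; slot++)
		sample_hqd(slot, &occ[slot]);

	/* Pass 2: keep only the slots whose doorbell belongs to the process. */
	for (slot = 0; slot < MAX_QUEUES; slot++)
		if (occ[slot].wave_cnt &&
		    process_owns_doorbell(proc_doorbells, 1, occ[slot].doorbell_off))
			proc_wave_cnt += occ[slot].wave_cnt;

	printf("process wave count: %u\n", proc_wave_cnt);  /* prints 8 */
	return 0;
}

Matching on the doorbell offset latched in the HQD is what lets us avoid the IH_VMID_X_LUT read
that could race with the CP updating the VMID-PASID mapping.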
>
> + queue_cnt->wave_cnt += wave_cnt;
> + queue_cnt->doorbell_off =
> +          (RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL) &
> +           CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >>
> +           CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
> + }
> }
>
> /**
> @@ -1021,24 +1023,19 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
> *
> * Reading registers referenced above involves programming GRBM
> appropriately
> */
> -void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
> - int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst)
> +void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
> + struct kfd_cu_occupancy *cu_occupancy,
> + int *max_waves_per_cu, uint32_t inst)
> {
> int qidx;
> - int vmid;
> int se_idx;
> - int sh_idx;
> int se_cnt;
> - int sh_cnt;
> - int wave_cnt;
> int queue_map;
> - int pasid_tmp;
> int max_queue_cnt;
> - int vmid_wave_cnt = 0;
> DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);
>
> lock_spi_csq_mutexes(adev);
> - soc15_grbm_select(adev, 1, 0, 0, 0, inst);
> + soc15_grbm_select(adev, 1, 0, 0, 0, GET_INST(GC, inst));
>
> /*
>  * Iterate through the shader engines and arrays of the device
> @@ -1048,51 +1045,38 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
> AMDGPU_MAX_QUEUES);
> max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
> adev->gfx.mec.num_queue_per_pipe;
> - sh_cnt = adev->gfx.config.max_sh_per_se;
> se_cnt = adev->gfx.config.max_shader_engines;
> for (se_idx = 0; se_idx < se_cnt; se_idx++) {
> - for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {
> + amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst);
> + queue_map = RREG32_SOC15(GC, GET_INST(GC, inst),
> + mmSPI_CSQ_WF_ACTIVE_STATUS);
> +
> + /*
> + * Assumption: queue map encodes following schema: four
> + * pipes per each micro-engine, with each pipe mapping
> + * eight queues. This schema is true for GFX9 devices
> + * and must be verified for newer device families
> + */
> + for (qidx = 0; qidx < max_queue_cnt; qidx++) {
> + /* Skip qeueus that are not associated with
> + * compute functions
> + */
> + if (!test_bit(qidx, cp_queue_bitmap))
> + continue;
>
> - amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst);
> -          queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
> + if (!(queue_map & (1 << qidx)))
> + continue;
>
> - /*
> - * Assumption: queue map encodes following schema: four
> - * pipes per each micro-engine, with each pipe mapping
> - * eight queues. This schema is true for GFX9 devices
> - * and must be verified for newer device families
> - */
> - for (qidx = 0; qidx < max_queue_cnt; qidx++) {
> -
> - /* Skip qeueus that are not associated with
> - * compute functions
> - */
> - if (!test_bit(qidx, cp_queue_bitmap))
> - continue;
> -
> - if (!(queue_map & (1 << qidx)))
> - continue;
> -
> - /* Get number of waves in flight and aggregate them */
> - get_wave_count(adev, qidx, &wave_cnt, &vmid,
> - inst);
> - if (wave_cnt != 0) {
> - pasid_tmp =
> - RREG32(SOC15_REG_OFFSET(OSSSYS, inst,
> - mmIH_VMID_0_LUT) + vmid);
> - if (pasid_tmp == pasid)
> - vmid_wave_cnt += wave_cnt;
> - }
> - }
> + /* Get number of waves in flight and aggregate them */
> + get_wave_count(adev, qidx, &cu_occupancy[qidx],
> + inst);
> }
> }
>
> amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst);
> - soc15_grbm_select(adev, 0, 0, 0, 0, inst);
> + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst));
> unlock_spi_csq_mutexes(adev);
>
> /* Update the output parameters and return */
> - *pasid_wave_cnt = vmid_wave_cnt;
> *max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
> adev->gfx.cu_info.max_waves_per_simd;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> index 988c50ac3be0..b6a91a552aa4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> @@ -52,8 +52,9 @@ bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
>  		uint8_t vmid, uint16_t *p_pasid);
>  void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
>  		uint32_t vmid, uint64_t page_table_base);
> -void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
> -		int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst);
> +void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
> +		struct kfd_cu_occupancy *cu_occupancy,
> +		int *max_waves_per_cu, uint32_t inst);
>  void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
>  		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
>  		uint32_t inst);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 71b465f8d83e..784d155d8bcc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -3540,6 +3540,26 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
>  	return debug_map_and_unlock(dqm);
>  }
>
> +bool kfd_dqm_find_queue_by_doorbell_off(struct device_queue_manager *dqm,
> +					struct qcm_process_device *qpd,
> +					int doorbell_off)
> +{
> + struct queue *q;
> + bool r = false;
> +
> + dqm_lock(dqm);
> +
> + list_for_each_entry(q, &qpd->queues_list, list) {
> + if (q->properties.doorbell_off == doorbell_off) {
> + r = true;
> + goto out;
> + }
> + }
> +
> +out:
> + dqm_unlock(dqm);
> + return r;
> +}
> #if defined(CONFIG_DEBUG_FS)
>
>  static void seq_reg_dump(struct seq_file *m,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index 08b40826ad1e..e5951589e5bd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -324,6 +324,9 @@ void set_queue_snapshot_entry(struct queue *q,
>  int debug_lock_and_unmap(struct device_queue_manager *dqm);
>  int debug_map_and_unlock(struct device_queue_manager *dqm);
>  int debug_refresh_runlist(struct device_queue_manager *dqm);
> +bool kfd_dqm_find_queue_by_doorbell_off(struct device_queue_manager *dqm,
> +					struct qcm_process_device *qpd,
> +					int doorbell_off);
>
>  static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
>  {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index a902950cc060..6720bd30517b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -270,6 +270,10 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
> struct kfd_node *dev = NULL;
> struct kfd_process *proc = NULL;
> struct kfd_process_device *pdd = NULL;
> + int i;
> + struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES];
> +
>
> You can just use sizeof(cu_occupancy), as it is an array.
>
> + memset(cu_occupancy, 0x0, sizeof(struct kfd_cu_occupancy) *
> + AMDGPU_MAX_QUEUES);
>
>
>
> pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy);
> dev = pdd->dev;
> @@ -287,9 +291,19 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
> /* Collect wave count from device if it supports */
> wave_cnt = 0;
> max_waves_per_cu = 0;
> - dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt,
> - &max_waves_per_cu, 0);
>
>
> A comment as to why you are using only the first XCC of the mask would be helpful.
>
Sure will do.
Thanks,
Mukul
> + dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy,
> + &max_waves_per_cu, ffs(dev->xcc_mask) - 1);
> +
> + for (i = 0; i < AMDGPU_MAX_QUEUES; i++) {
> + if (cu_occupancy[i].wave_cnt != 0 &&
> + kfd_dqm_find_queue_by_doorbell_off(dev->dqm, &pdd->qpd,
> + cu_occupancy[i].doorbell_off))
> + wave_cnt += cu_occupancy[i].wave_cnt;
> + }
> +
> + /* Update wave_cnt for the number of XCCs in the partition */
> + wave_cnt *= NUM_XCC(dev->xcc_mask);
> /* Translate wave count to number of compute units */
> cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
>  	return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);
> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> index 7744ca3ef4b1..e3e635a31b8a 100644
> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
> @@ -71,6 +71,11 @@ enum kgd_memory_pool {
> KGD_POOL_FRAMEBUFFER = 3,
> };
>
> +struct kfd_cu_occupancy {
> + u32 wave_cnt;
> + u32 doorbell_off;
> +};
> +
> /**
> * enum kfd_sched_policy
> *
> @@ -313,8 +318,9 @@ struct kfd2kgd_calls {
> uint32_t grace_period,
> uint32_t *reg_offset,
> uint32_t *reg_data);
> - void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
> - int *wave_cnt, int *max_waves_per_cu, uint32_t inst);
> + void (*get_cu_occupancy)(struct amdgpu_device *adev,
> + struct kfd_cu_occupancy *cu_occupancy,
> + int *max_waves_per_cu, uint32_t inst);
> void (*program_trap_handler_settings)(struct amdgpu_device *adev,
> uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
> uint32_t inst);
> --
> 2.35.1
>