[PATCH v3] drm/amdkfd: Fix bug in config_dequeue_wait_counts
Kim, Jonathan
Jonathan.Kim at amd.com
Sat Mar 15 03:41:38 UTC 2025
[Public]
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Harish
> Kasiviswanathan
> Sent: Friday, March 14, 2025 8:44 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Kasiviswanathan, Harish <Harish.Kasiviswanathan at amd.com>
> Subject: [PATCH v3] drm/amdkfd: Fix bug in config_dequeue_wait_counts
>
> For certain ASICs where dequeue_wait_count don't need to be initialized,
> pm_config_dequeue_wait_counts_v9 return without filling in the packet
> information. However, the calling function interprets this as a success
> and sends the uninitialized packet to firmware causing hang.
>
> Fix the above bug by not calling pm_config_dequeue_wait_counts_v9 for
> ASICs that don't need the value to be initialized.
>
> v2: Removed redudant code.
> Tidy up code based on review comments
> v3: Don't call pm_config_dequeue_wait_counts_v9 for certain ASICs
>
> Fixes: <98a5af8103f> ("drm/amdkfd: Add pm_config_dequeue_wait_counts API")
>
> Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan at amd.com>
Nit-picks below.
With those addressed and as long as this has been tested on optimized and unoptimized HW:
Reviewed-by: Jonathan Kim <jonathan.kim at amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_packet_manager.c | 16 ++++++----
> .../drm/amd/amdkfd/kfd_packet_manager_v9.c | 30 +++++++++++--------
> 2 files changed, 27 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 3f574d82b5fc..8a47b7259a10 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -418,6 +418,10 @@ int pm_config_dequeue_wait_counts(struct
> packet_manager *pm,
> !pm->pmf->config_dequeue_wait_counts_size)
> return 0;
>
> + if (cmd == KFD_DEQUEUE_WAIT_INIT && (KFD_GC_VERSION(pm->dqm-
> >dev) < IP_VERSION(9, 4, 1) ||
> + KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(10, 0, 0)))
> + return 0;
> +
> size = pm->pmf->config_dequeue_wait_counts_size;
>
> mutex_lock(&pm->lock);
> @@ -436,16 +440,16 @@ int pm_config_dequeue_wait_counts(struct
> packet_manager *pm,
>
> retval = pm->pmf->config_dequeue_wait_counts(pm, buffer,
> cmd, value);
> - if (!retval)
> + if (!retval) {
> retval = kq_submit_packet(pm->priv_queue);
> +
> + /* If default value is modified, cache that in dqm->wait_times
> */
> + if (!retval && cmd == KFD_DEQUEUE_WAIT_INIT)
> + update_dqm_wait_times(pm->dqm);
> + }
> else
> kq_rollback_packet(pm->priv_queue);
Put else statement next to brace in line above and put braces after else statement to balance the if statement braces.
> }
> -
> - /* If default value is modified, cache that value in dqm->wait_times */
> - if (!retval && cmd == KFD_DEQUEUE_WAIT_INIT)
> - update_dqm_wait_times(pm->dqm);
> -
> out:
> mutex_unlock(&pm->lock);
> return retval;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> index d440df602393..f059041902bc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
> @@ -310,6 +310,13 @@ static inline void
> pm_build_dequeue_wait_counts_packet_info(struct packet_manage
> reg_data);
> }
>
> +/* pm_config_dequeue_wait_counts_v9: Builds WRITE_DATA packet with
> + * register/value for configuring dequeue wait counts
> + *
> + * @return: -ve for failure and 0 for success and buffer is
> + * filled in with packet
> + *
> + **/
> static int pm_config_dequeue_wait_counts_v9(struct packet_manager *pm,
> uint32_t *buffer,
> enum kfd_config_dequeue_wait_counts_cmd cmd,
> @@ -321,24 +328,21 @@ static int pm_config_dequeue_wait_counts_v9(struct
> packet_manager *pm,
>
> switch (cmd) {
> case KFD_DEQUEUE_WAIT_INIT: {
> - uint32_t sch_wave = 0, que_sleep = 0;
> - /* Reduce CP_IQ_WAIT_TIME2.QUE_SLEEP to 0x1 from default
> 0x40.
> + uint32_t sch_wave = 0, que_sleep = 1;
Do the following here (start of case WAIT_INIT) for safety to explicitly state certain devices are not permitted to do WAIT_INIT since you're unconditionally setting que_sleep to 1:
if (KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(9, 4, 1) ||
KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(10, 0, 0))
return -EPERM;
Jon
> +
> + /* For all gfx9 ASICs > gfx941,
> + * Reduce CP_IQ_WAIT_TIME2.QUE_SLEEP to 0x1 from default
> 0x40.
> * On a 1GHz machine this is roughly 1 microsecond, which is
> * about how long it takes to load data out of memory during
> * queue connect
> * QUE_SLEEP: Wait Count for Dequeue Retry.
> + *
> + * Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU
> */
> - if (KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(9, 4, 1) &&
> - KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(10, 0, 0)) {
> - que_sleep = 1;
> -
> - /* Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU
> */
> - if (amdgpu_emu_mode == 0 && pm->dqm->dev->adev-
> >gmc.is_app_apu &&
> - (KFD_GC_VERSION(pm->dqm->dev) == IP_VERSION(9, 4,
> 3)))
> - sch_wave = 1;
> - } else {
> - return 0;
> - }
> + if (amdgpu_emu_mode == 0 && pm->dqm->dev->adev-
> >gmc.is_app_apu &&
> + (KFD_GC_VERSION(pm->dqm->dev) == IP_VERSION(9, 4, 3)))
> + sch_wave = 1;
> +
> pm_build_dequeue_wait_counts_packet_info(pm, sch_wave,
> que_sleep,
> ®_offset, ®_data);
>
> --
> 2.34.1
More information about the amd-gfx
mailing list