[v4 1/7] drm/amd/amdgpu: Simplify SDMA reset mechanism by removing dynamic callbacks
Alex Deucher
alexdeucher at gmail.com
Fri Apr 11 02:58:08 UTC 2025
On Tue, Apr 8, 2025 at 4:47 AM Jesse.zhang at amd.com <jesse.zhang at amd.com> wrote:
>
> Since KFD no longer registers its own callbacks for SDMA resets, and only KGD uses the reset mechanism,
> we can simplify the SDMA reset flow by directly calling the ring's `stop_queue` and `start_queue` functions.
> This patch removes the dynamic callback mechanism and prepares for its eventual deprecation.
>
> 1. **Remove Dynamic Callbacks**:
> - The `pre_reset` and `post_reset` callback invocations in `amdgpu_sdma_reset_engine` are removed.
> - Instead, the ring's `stop_queue` and `start_queue` functions are called directly during the reset process.
>
> 2. **Prepare for Deprecation of Dynamic Mechanism**:
> - By removing the callback invocations, this patch prepares the codebase for the eventual removal of the dynamic callback registration mechanism.
>
> v2: Update the stop_queue/start_queue function parameters to take a ring pointer instead of device/instance (Christian)
> v3: Follow the general coding style and declare variables like "i" or "r" last, i.e. longest lines first, shortest last (Christian)
> v4: move stop_queue/start_queue to struct amdgpu_sdma_instance and rename them. (Alex)
>
> Signed-off-by: Jesse Zhang <Jesse.Zhang at amd.com>
These patches look good, but I would rework the ordering a bit. I'd
suggest splitting them up like so:
1. Add the new sdma function pointers (stop_kernel_queue,
start_kernel_queue, soft_reset) to amdgpu_sdma.h (see the sketch
after this list)
2. Register the new sdma function pointers (stop_kernel_queue,
start_kernel_queue, soft_reset) for each sdma IP version that needs
them
3. Switch amdgpu_sdma_reset_engine() to use the new sdma function pointers
4. Optimize the queue reset and stop logic patches
5. Implement soft reset directly for the SDMA 5.x patches
6. Remove the old sdma reset callback mechanism
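To make step 1 concrete, I'm thinking of something along these lines
(just a sketch; the stop/start signatures are taken from this patch,
but the soft_reset signature is my guess, whatever fits the IP
versions best works too):

struct amdgpu_sdma_funcs {
	int (*stop_kernel_queue)(struct amdgpu_ring *ring);
	int (*start_kernel_queue)(struct amdgpu_ring *ring);
	/* per-IP soft reset; exact signature is a guess */
	int (*soft_reset)(struct amdgpu_device *adev, u32 instance_id);
};

Each SDMA IP version would then only fill in the hooks it needs, and
amdgpu_sdma_reset_engine() checks for NULL before calling them, like
the sdma_v4_4_2 code in this patch already does.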
Alex
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 54 ++++++------------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 7 ++-
> drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 22 ++++++----
> 3 files changed, 34 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 0a9893fee828..541b349e0310 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -558,20 +558,14 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
> * @adev: Pointer to the AMDGPU device
> * @instance_id: ID of the SDMA engine instance to reset
> *
> - * This function performs the following steps:
> - * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save their state.
> - * 2. Resets the specified SDMA engine instance.
> - * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to restore their state.
> - *
> * Returns: 0 on success, or a negative error code on failure.
> */
> int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
> {
> - struct sdma_on_reset_funcs *funcs;
> int ret = 0;
> struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];
> - struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
> - struct amdgpu_ring *page_ring = &sdma_instance->page;
> + struct amdgpu_ring *sdma_gfx_ring = &sdma_instance->ring;
> + struct amdgpu_ring *sdma_page_ring = &sdma_instance->page;
> bool gfx_sched_stopped = false, page_sched_stopped = false;
>
> mutex_lock(&sdma_instance->engine_reset_mutex);
> @@ -579,28 +573,18 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
> * This ensures that no new tasks are submitted to the queues while
> * the reset is in progress.
> */
> - if (!amdgpu_ring_sched_ready(gfx_ring)) {
> - drm_sched_wqueue_stop(&gfx_ring->sched);
> + if (!amdgpu_ring_sched_ready(sdma_gfx_ring)) {
> + drm_sched_wqueue_stop(&sdma_gfx_ring->sched);
> gfx_sched_stopped = true;
> }
>
> - if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
> - drm_sched_wqueue_stop(&page_ring->sched);
> + if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(sdma_page_ring)) {
> + drm_sched_wqueue_stop(&sdma_page_ring->sched);
> page_sched_stopped = true;
> }
>
> - /* Invoke all registered pre_reset callbacks */
> - list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> - if (funcs->pre_reset) {
> - ret = funcs->pre_reset(adev, instance_id);
> - if (ret) {
> - dev_err(adev->dev,
> - "beforeReset callback failed for instance %u: %d\n",
> - instance_id, ret);
> - goto exit;
> - }
> - }
> - }
> + if (sdma_instance->funcs->stop_kernel_queue)
> + sdma_instance->funcs->stop_kernel_queue(sdma_gfx_ring);
>
> /* Perform the SDMA reset for the specified instance */
> ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
> @@ -609,18 +593,8 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
> goto exit;
> }
>
> - /* Invoke all registered post_reset callbacks */
> - list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> - if (funcs->post_reset) {
> - ret = funcs->post_reset(adev, instance_id);
> - if (ret) {
> - dev_err(adev->dev,
> - "afterReset callback failed for instance %u: %d\n",
> - instance_id, ret);
> - goto exit;
> - }
> - }
> - }
> + if (sdma_instance->funcs->start_kernel_queue)
> + sdma_instance->funcs->start_kernel_queue(sdma_gfx_ring);
>
> exit:
> /* Restart the scheduler's work queue for the GFX and page rings
> @@ -628,11 +602,11 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
> * to be submitted to the queues after the reset is complete.
> */
> if (!ret) {
> - if (gfx_sched_stopped && amdgpu_ring_sched_ready(gfx_ring)) {
> - drm_sched_wqueue_start(&gfx_ring->sched);
> + if (gfx_sched_stopped && amdgpu_ring_sched_ready(sdma_gfx_ring)) {
> + drm_sched_wqueue_start(&sdma_gfx_ring->sched);
> }
> - if (page_sched_stopped && amdgpu_ring_sched_ready(page_ring)) {
> - drm_sched_wqueue_start(&page_ring->sched);
> + if (page_sched_stopped && amdgpu_ring_sched_ready(sdma_page_ring)) {
> + drm_sched_wqueue_start(&sdma_page_ring->sched);
> }
> }
> mutex_unlock(&sdma_instance->engine_reset_mutex);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index 47d56fd0589f..620fd7663526 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -50,6 +50,11 @@ enum amdgpu_sdma_irq {
>
> #define NUM_SDMA(x) hweight32(x)
>
> +struct amdgpu_sdma_funcs {
> + int (*stop_kernel_queue)(struct amdgpu_ring *ring);
> + int (*start_kernel_queue)(struct amdgpu_ring *ring);
> +};
> +
> struct amdgpu_sdma_instance {
> /* SDMA firmware */
> const struct firmware *fw;
> @@ -68,7 +73,7 @@ struct amdgpu_sdma_instance {
> /* track guilty state of GFX and PAGE queues */
> bool gfx_guilty;
> bool page_guilty;
> -
> + const struct amdgpu_sdma_funcs *funcs;
> };
>
> enum amdgpu_sdma_ras_memory_id {
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 688a720bbbbd..c663c63485f4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -108,6 +108,8 @@ static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
> static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
> static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev);
> static void sdma_v4_4_2_update_reset_mask(struct amdgpu_device *adev);
> +static int sdma_v4_4_2_stop_queue(struct amdgpu_ring *ring);
> +static int sdma_v4_4_2_restore_queue(struct amdgpu_ring *ring);
>
> static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
> u32 instance, u32 offset)
> @@ -1333,6 +1335,11 @@ static bool sdma_v4_4_2_fw_support_paging_queue(struct amdgpu_device *adev)
> }
> }
>
> +static const struct amdgpu_sdma_funcs sdma_v4_4_2_sdma_funcs = {
> + .stop_kernel_queue = &sdma_v4_4_2_stop_queue,
> + .start_kernel_queue = &sdma_v4_4_2_restore_queue,
> +};
> +
> static int sdma_v4_4_2_early_init(struct amdgpu_ip_block *ip_block)
> {
> struct amdgpu_device *adev = ip_block->adev;
> @@ -1352,7 +1359,6 @@ static int sdma_v4_4_2_early_init(struct amdgpu_ip_block *ip_block)
> sdma_v4_4_2_set_irq_funcs(adev);
> sdma_v4_4_2_set_ras_funcs(adev);
> sdma_v4_4_2_set_engine_reset_funcs(adev);
> -
> return 0;
> }
>
> @@ -1447,6 +1453,7 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
> /* Initialize guilty flags for GFX and PAGE queues */
> adev->sdma.instance[i].gfx_guilty = false;
> adev->sdma.instance[i].page_guilty = false;
> + adev->sdma.instance[i].funcs = &sdma_v4_4_2_sdma_funcs;
>
> ring = &adev->sdma.instance[i].ring;
> ring->ring_obj = NULL;
> @@ -1678,11 +1685,12 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
> return r;
> }
>
> -static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_id)
> +static int sdma_v4_4_2_stop_queue(struct amdgpu_ring *ring)
> {
> + struct amdgpu_device *adev = ring->adev;
> + u32 instance_id = GET_INST(SDMA0, ring->me);
> u32 inst_mask;
> uint64_t rptr;
> - struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
>
> if (amdgpu_sriov_vf(adev))
> return -EINVAL;
> @@ -1715,11 +1723,11 @@ static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_
> return 0;
> }
>
> -static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t instance_id)
> +static int sdma_v4_4_2_restore_queue(struct amdgpu_ring *ring)
> {
> - int i;
> + struct amdgpu_device *adev = ring->adev;
> u32 inst_mask;
> - struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
> + int i;
>
> inst_mask = 1 << ring->me;
> udelay(50);
> @@ -1740,8 +1748,6 @@ static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t instan
> }
>
> static struct sdma_on_reset_funcs sdma_v4_4_2_engine_reset_funcs = {
> - .pre_reset = sdma_v4_4_2_stop_queue,
> - .post_reset = sdma_v4_4_2_restore_queue,
> };
>
> static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev)
> --
> 2.25.1
>