[PATCH] drm/amdgpu: Fix the warning info in mode1 reset

vitaly prosyak vprosyak at amd.com
Mon Jan 29 19:28:41 UTC 2024


The change looks good to me.
Validated now on kernel 5.7 ( branch origin/adeucher/amd-staging-drm-next)


Thanks,  Vitaly


On 2024-01-29 10:33, Deucher, Alexander wrote:
>
> [AMD Official Use Only - General]
>
>
> Ping?
>
> ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> *From:* Deucher, Alexander <Alexander.Deucher at amd.com>
> *Sent:* Thursday, January 25, 2024 11:15 AM
> *To:* amd-gfx at lists.freedesktop.org <amd-gfx at lists.freedesktop.org>
> *Cc:* Ma, Jun <Jun.Ma2 at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>; Prosyak, Vitaly <Vitaly.Prosyak at amd.com>
> *Subject:* [PATCH] drm/amdgpu: Fix the warning info in mode1 reset
>  
> From: Ma Jun <Jun.Ma2 at amd.com>
>
> Fix the warning info below during mode1 reset.
> [  +0.000004] Call Trace:
> [  +0.000004]  <TASK>
> [  +0.000006]  ? show_regs+0x6e/0x80
> [  +0.000011]  ? __flush_work.isra.0+0x2e8/0x390
> [  +0.000005]  ? __warn+0x91/0x150
> [  +0.000009]  ? __flush_work.isra.0+0x2e8/0x390
> [  +0.000006]  ? report_bug+0x19d/0x1b0
> [  +0.000013]  ? handle_bug+0x46/0x80
> [  +0.000012]  ? exc_invalid_op+0x1d/0x80
> [  +0.000011]  ? asm_exc_invalid_op+0x1f/0x30
> [  +0.000014]  ? __flush_work.isra.0+0x2e8/0x390
> [  +0.000007]  ? __flush_work.isra.0+0x208/0x390
> [  +0.000007]  ? _prb_read_valid+0x216/0x290
> [  +0.000008]  __cancel_work_timer+0x11d/0x1a0
> [  +0.000007]  ? try_to_grab_pending+0xe8/0x190
> [  +0.000012]  cancel_work_sync+0x14/0x20
> [  +0.000008]  amddrm_sched_stop+0x3c/0x1d0 [amd_sched]
> [  +0.000032]  amdgpu_device_gpu_recover+0x29a/0xe90 [amdgpu]
>
> This warning info was printed after applying the patch
> "drm/sched: Convert drm scheduler to use a work queue rather than kthread".
> The root cause is that amdgpu driver tries to use the uninitialized
> work_struct in the struct drm_gpu_scheduler
>
> v2:
>  - Rename the function to amdgpu_ring_sched_ready and move it to
> amdgpu_ring.c (Alex)
> v3:
> - Fix a few more checks based on Vitaly's patch (Alex)
>
> Fixes: 11b3b9f461c5 ("drm/sched: Check scheduler ready before calling timeout handling")
> Reviewed-by: Alex Deucher <alexander.deucher at amd.com>
> Signed-off-by: Vitaly Prosyak <vitaly.prosyak at amd.com>
> Signed-off-by: Ma Jun <Jun.Ma2 at amd.com>
> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c    |  2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c        |  8 ++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c         | 12 ++++++------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c           | 14 +++++++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h           |  2 +-
>  5 files changed, 25 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index 899e31e3a5e8..3a3f3ce09f00 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -290,7 +290,7 @@ static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus
>          for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>                  struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
>  
> -               if (!(ring && drm_sched_wqueue_ready(&ring->sched)))
> +               if (!amdgpu_ring_sched_ready(ring))
>                          continue;
>  
>                  /* stop secheduler and drain ring. */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index e485dd3357c6..1afbb2e932c6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1678,7 +1678,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
>          for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>                  struct amdgpu_ring *ring = adev->rings[i];
>  
> -               if (!ring || !drm_sched_wqueue_ready(&ring->sched))
> +               if (!amdgpu_ring_sched_ready(ring))
>                          continue;
>                  drm_sched_wqueue_stop(&ring->sched);
>          }
> @@ -1694,7 +1694,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)
>          for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>                  struct amdgpu_ring *ring = adev->rings[i];
>  
> -               if (!ring || !drm_sched_wqueue_ready(&ring->sched))
> +               if (!amdgpu_ring_sched_ready(ring))
>                          continue;
>                  drm_sched_wqueue_start(&ring->sched);
>          }
> @@ -1916,8 +1916,8 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>  
>          ring = adev->rings[val];
>  
> -       if (!ring || !ring->funcs->preempt_ib ||
> -           !drm_sched_wqueue_ready(&ring->sched))
> +       if (!amdgpu_ring_sched_ready(ring) ||
> +           !ring->funcs->preempt_ib)
>                  return -EINVAL;
>  
>          /* the last preemption failed */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 1a04ccba9542..7ff17df7a5ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5042,7 +5042,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
>          for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                  struct amdgpu_ring *ring = adev->rings[i];
>  
> -               if (!ring || !drm_sched_wqueue_ready(&ring->sched))
> +               if (!amdgpu_ring_sched_ready(ring))
>                          continue;
>  
>                  spin_lock(&ring->sched.job_list_lock);
> @@ -5181,7 +5181,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>          for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                  struct amdgpu_ring *ring = adev->rings[i];
>  
> -               if (!ring || !drm_sched_wqueue_ready(&ring->sched))
> +               if (!amdgpu_ring_sched_ready(ring))
>                          continue;
>  
>                  /* Clear job fence from fence drv to avoid force_completion
> @@ -5648,7 +5648,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                  for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                          struct amdgpu_ring *ring = tmp_adev->rings[i];
>  
> -                       if (!ring || !drm_sched_wqueue_ready(&ring->sched))
> +                       if (!amdgpu_ring_sched_ready(ring))
>                                  continue;
>  
>                          drm_sched_stop(&ring->sched, job ? &job->base : NULL);
> @@ -5717,7 +5717,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>                  for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                          struct amdgpu_ring *ring = tmp_adev->rings[i];
>  
> -                       if (!ring || !drm_sched_wqueue_ready(&ring->sched))
> +                       if (!amdgpu_ring_sched_ready(ring))
>                                  continue;
>  
>                          drm_sched_start(&ring->sched, true);
> @@ -6072,7 +6072,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
>                  for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                          struct amdgpu_ring *ring = adev->rings[i];
>  
> -                       if (!ring || !drm_sched_wqueue_ready(&ring->sched))
> +                       if (!amdgpu_ring_sched_ready(ring))
>                                  continue;
>  
>                          drm_sched_stop(&ring->sched, NULL);
> @@ -6214,7 +6214,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
>          for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                  struct amdgpu_ring *ring = adev->rings[i];
>  
> -               if (!ring || !drm_sched_wqueue_ready(&ring->sched))
> +               if (!amdgpu_ring_sched_ready(ring))
>                          continue;
>  
>                  drm_sched_start(&ring->sched, true);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 45424ebf9681..9ae386e9d41d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -634,7 +634,8 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
>                  DRM_DEV_DEBUG(adev->dev, "ring test on %s succeeded\n",
>                                ring->name);
>  
> -       ring->sched.ready = !r;
> +       if (!ring->no_scheduler)
> +               ring->sched.ready = !r;
>          return r;
>  }
>  
> @@ -717,3 +718,14 @@ void amdgpu_ring_ib_on_emit_de(struct amdgpu_ring *ring)
>          if (ring->is_sw_ring)
>                  amdgpu_sw_ring_ib_mark_offset(ring, AMDGPU_MUX_OFFSET_TYPE_DE);
>  }
> +
> +bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring)
> +{
> +       if (!ring)
> +               return false;
> +
> +       if (ring->no_scheduler || !drm_sched_wqueue_ready(&ring->sched))
> +               return false;
> +
> +       return true;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index bbb53720a018..fe1a61eb6e4c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -450,5 +450,5 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>  int amdgpu_ib_pool_init(struct amdgpu_device *adev);
>  void amdgpu_ib_pool_fini(struct amdgpu_device *adev);
>  int amdgpu_ib_ring_tests(struct amdgpu_device *adev);
> -
> +bool amdgpu_ring_sched_ready(struct amdgpu_ring *ring);
>  #endif
> -- 
> 2.42.0
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240129/1dfd0a0e/attachment-0001.htm>


More information about the amd-gfx mailing list