[PATCH] drm/amdgpu: Added Debug Logging for Process Isolation

Russell, Kent Kent.Russell at amd.com
Mon Jan 6 14:21:03 UTC 2025


[AMD Official Use Only - AMD Internal Distribution Only]

Just for my personal edification, do we need __func__ for *_dbg calls? We can already add the function name when enabling dyndbg with +f , and the messages are unique here so there's no real need for an identifier. Just wondering if my assumptions are incorrect on that. Thanks!

 Kent

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Srinivasan
> Shanmugam
> Sent: Monday, January 6, 2025 7:44 AM
> To: Koenig, Christian <Christian.Koenig at amd.com>; Deucher, Alexander
> <Alexander.Deucher at amd.com>
> Cc: amd-gfx at lists.freedesktop.org; SHANMUGAM, SRINIVASAN
> <SRINIVASAN.SHANMUGAM at amd.com>
> Subject: [PATCH] drm/amdgpu: Added Debug Logging for Process Isolation
>
> Added debug logging to provide insights into KGD/KFD scheduling, cleaner
> shader emission, and isolation processes.
>
> Cc: Christian König <christian.koenig at amd.com>
> Cc: Alex Deucher <alexander.deucher at amd.com>
> Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 30 +++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  |  5 ++++-
>  2 files changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 89f7c89d1392..ef985a7bda1a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1912,16 +1912,19 @@ static void amdgpu_gfx_kfd_sch_ctrl(struct
> amdgpu_device *adev, u32 idx,
>
>               adev->gfx.kfd_sch_req_count[idx]--;
>
> +             dev_dbg(adev->dev, "%s: Enabling KFD scheduler for idx: %u\n",
> __func__, idx);
>               if (adev->gfx.kfd_sch_req_count[idx] == 0 &&
>                   adev->gfx.kfd_sch_inactive[idx]) {
>                       schedule_delayed_work(&adev-
> >gfx.enforce_isolation[idx].work,
>                                             msecs_to_jiffies(adev-
> >gfx.enforce_isolation_time[idx]));
>               }
>       } else {
> +             dev_dbg(adev->dev, "%s: Disabling KFD scheduler for idx: %u\n",
> __func__, idx);
>               if (adev->gfx.kfd_sch_req_count[idx] == 0) {
>                       cancel_delayed_work_sync(&adev-
> >gfx.enforce_isolation[idx].work);
>                       if (!adev->gfx.kfd_sch_inactive[idx]) {
>                               amdgpu_amdkfd_stop_sched(adev, idx);
> +                             dev_dbg(adev->dev, "%s: KFD stopped\n", __func__);
>                               adev->gfx.kfd_sch_inactive[idx] = true;
>                       }
>               }
> @@ -1960,7 +1963,12 @@ void amdgpu_gfx_enforce_isolation_handler(struct
> work_struct *work)
>       if (idx >= MAX_XCP)
>               return;
>
> +     dev_dbg(adev->dev, "%s: Starting enforce isolation handler for idx: %u\n",
> +             __func__, idx);
> +
>       mutex_lock(&adev->enforce_isolation_mutex);
> +     dev_dbg(adev->dev, "%s: Processing fences for idx: %u. Checking GFX and
> Compute rings.\n",
> +             __func__, idx);
>       for (i = 0; i < AMDGPU_MAX_GFX_RINGS; ++i) {
>               if (isolation_work->xcp_id == adev->gfx.gfx_ring[i].xcp_id)
>                       fences += amdgpu_fence_count_emitted(&adev-
> >gfx.gfx_ring[i]);
> @@ -1969,16 +1977,23 @@ void amdgpu_gfx_enforce_isolation_handler(struct
> work_struct *work)
>               if (isolation_work->xcp_id == adev->gfx.compute_ring[i].xcp_id)
>                       fences += amdgpu_fence_count_emitted(&adev-
> >gfx.compute_ring[i]);
>       }
> +     dev_dbg(adev->dev, "%s: Total fences processed for idx: %u: %u\n",
> +             __func__, idx, fences);
>       if (fences) {
> +             dev_dbg(adev->dev, "%s: Fences detected for idx: %u, scheduling
> delayed work\n",
> +                     __func__, idx);
>               /* we've already had our timeslice, so let's wrap this up */
>               schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
>                                     msecs_to_jiffies(1));
>       } else {
> +             dev_dbg(adev->dev, "%s: No fences detected for idx: %u, no delayed
> work scheduled\n",
> +                     __func__, idx);
>               /* Tell KFD to resume the runqueue */
>               if (adev->kfd.init_complete) {
>                       WARN_ON_ONCE(!adev->gfx.kfd_sch_inactive[idx]);
>                       WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx]);
>                               amdgpu_amdkfd_start_sched(adev, idx);
> +                             dev_dbg(adev->dev, "%s: KFD started\n", __func__);
>                               adev->gfx.kfd_sch_inactive[idx] = false;
>               }
>       }
> @@ -2009,6 +2024,8 @@ amdgpu_gfx_enforce_isolation_wait_for_kfd(struct
> amdgpu_device *adev,
>               if (!adev->gfx.enforce_isolation_jiffies[idx]) {
>                       adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
>                       adev->gfx.enforce_isolation_time[idx] =
>       GFX_SLICE_PERIOD_MS;
> +                     dev_dbg(adev->dev, "%s: Initializing time slice for idx: %u.
> Setting start time and duration for fair GPU access between KGD and KFD. Initial
> jiffies: %lu, Time slice: %u ms\n",
> +                             __func__, idx, jiffies, GFX_SLICE_PERIOD_MS);
>               }
>               /* Make sure KFD gets a chance to run */
>               if (amdgpu_amdkfd_compute_active(adev, idx)) {
> @@ -2020,19 +2037,28 @@ amdgpu_gfx_enforce_isolation_wait_for_kfd(struct
> amdgpu_device *adev,
>                                       wait = true;
>                                       /* reset the timer period */
>                                       adev->gfx.enforce_isolation_time[idx] =
>       GFX_SLICE_PERIOD_MS;
> +                                     dev_dbg(adev->dev, "%s: Time slice expired
> for idx: %u. Allowing KGD to finish its work before KFD resumes. Resetting time slice
> to %u ms\n",
> +                                             __func__, idx,
> GFX_SLICE_PERIOD_MS);
>                               } else {
>                                       /* set the timer period to what's left in our time
> slice */
>                                       adev->gfx.enforce_isolation_time[idx] =
>                                               GFX_SLICE_PERIOD_MS -
> jiffies_to_msecs(cjiffies);
> +                                     dev_dbg(adev->dev, "%s: Adjusting remaining
> time slice for idx: %u. KFD can use the GPU for %lu more ms\n",
> +                                             __func__, idx,
> +                                             adev->gfx.enforce_isolation_time[idx]);
>                               }
>                       } else {
>                               /* if jiffies wrap around we will just wait a little longer */
>                               adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
> +                             dev_dbg(adev->dev, "%s: Jiffies reset for idx: %u.
> Starting new time slice to ensure fair GPU access between KGD and KFD. New
> jiffies: %lu\n",
> +                                     __func__, idx, jiffies);
>                       }
>               } else {
>                       /* if there is no KFD work, then set the full slice period */
>                       adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
>                       adev->gfx.enforce_isolation_time[idx] =
> GFX_SLICE_PERIOD_MS;
> +                     dev_dbg(adev->dev, "%s: No KFD work for idx: %u. Setting
> full slice period to %u ms\n",
> +                             __func__, idx, GFX_SLICE_PERIOD_MS);
>               }
>       }
>       mutex_unlock(&adev->enforce_isolation_mutex);
> @@ -2055,6 +2081,8 @@ void
> amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
>       struct amdgpu_device *adev = ring->adev;
>       u32 idx;
>
> +     dev_dbg(adev->dev, "%s: begin_use\n", __func__);
> +
>       if (!adev->gfx.enable_cleaner_shader)
>               return;
>
> @@ -2089,6 +2117,8 @@ void amdgpu_gfx_enforce_isolation_ring_end_use(struct
> amdgpu_ring *ring)
>       struct amdgpu_device *adev = ring->adev;
>       u32 idx;
>
> +     dev_dbg(adev->dev, "%s: end_use\n", __func__);
> +
>       if (!adev->gfx.enable_cleaner_shader)
>               return;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index c9c48b782ec1..06b527388402 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -688,8 +688,11 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct
> amdgpu_job *job,
>
>       if (adev->gfx.enable_cleaner_shader &&
>           ring->funcs->emit_cleaner_shader &&
> -         job->enforce_isolation)
> +         job->enforce_isolation) {
> +             dev_dbg(ring->adev->dev, "%s: Emitting cleaner shader for ring:
> %s\n",
> +                     __func__, ring->name);
>               ring->funcs->emit_cleaner_shader(ring);
> +     }
>
>       if (vm_flush_needed) {
>               trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
> --
> 2.34.1



More information about the amd-gfx mailing list