[PATCH 1/1] drm/amdkfd: Add eviction debug messages
philip yang
yangp at amd.com
Fri Jun 12 22:03:01 UTC 2020
It's a good idea. It would be better to add the same print in the system memory
eviction path, amdgpu_amdkfd_evict_userptr.
Use WARN_ONCE to avoid duplicate messages.
Regards,
Philip
On 2020-06-11 11:34 p.m., Felix Kuehling wrote:
> Use WARN to print messages with backtrace when evictions are triggered.
> This can help determine the root cause of evictions and help spot driver
> bugs triggering evictions unintentionally, or help with performance tuning
> by avoiding conditions that cause evictions in a specific workload.
>
> The messages are controlled by a new module parameter that can be changed
> at runtime:
>
> echo Y > /sys/module/amdgpu/parameters/debug_evictions
> echo N > /sys/module/amdgpu/parameters/debug_evictions
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 ++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 2 ++
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 3 +++
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +++++
> 5 files changed, 20 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 10ae92e835f6..6c7dd0a707c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -186,8 +186,10 @@ extern int amdgpu_noretry;
> extern int amdgpu_force_asic_type;
> #ifdef CONFIG_HSA_AMD
> extern int sched_policy;
> +extern bool debug_evictions;
> #else
> static const int sched_policy = KFD_SCHED_POLICY_HWS;
> +static const bool debug_evictions; /* = false */
> #endif
>
> extern int amdgpu_tmz;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index d4d7cca1cc72..fdf350d5e7b7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -705,6 +705,14 @@ MODULE_PARM_DESC(hws_gws_support, "Assume MEC2 FW supports GWS barriers (false =
> int queue_preemption_timeout_ms = 9000;
> module_param(queue_preemption_timeout_ms, int, 0644);
> MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption timeout in ms (1 = Minimum, 9000 = default)");
> +
> +/**
> + * DOC: debug_evictions(bool)
> + * Enable extra debug messages to help determine the cause of evictions
> + */
> +bool debug_evictions;
> +module_param(debug_evictions, bool, 0644);
> +MODULE_PARM_DESC(debug_evictions, "enable eviction debug messages (false = default)");
> #endif
>
> /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> index b87ca171986a..072f0e1185a8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> @@ -275,6 +275,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
> continue;
> }
>
> + WARN(debug_evictions && fence_owner == AMDGPU_FENCE_OWNER_KFD,
> + "Adding eviction fence to sync obj");
> r = amdgpu_sync_fence(sync, f, false);
> if (r)
> break;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 22348cebaf36..80393e0583bb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -942,6 +942,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
> if (!p)
> return -ESRCH;
>
> + WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
> r = kfd_process_evict_queues(p);
>
> kfd_unref_process(p);
> @@ -1009,6 +1010,8 @@ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
> /* During process initialization eviction_work.dwork is initialized
> * to kfd_evict_bo_worker
> */
> + WARN(debug_evictions, "Scheduling eviction of pid %d in %ld jiffies",
> + p->lead_thread->pid, delay_jiffies);
> schedule_delayed_work(&p->eviction_work, delay_jiffies);
> out:
> kfd_unref_process(p);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 173d58b2d81f..51ba2020732e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -177,6 +177,11 @@ extern bool hws_gws_support;
> */
> extern int queue_preemption_timeout_ms;
>
> +/*
> + * Enable eviction debug messages
> + */
> +extern bool debug_evictions;
> +
> enum cache_policy {
> cache_policy_coherent,
> cache_policy_noncoherent
More information about the amd-gfx
mailing list