[PATCH 1/1] drm/amdkfd: Add eviction debug messages
philip yang
yangp at amd.com
Sat Jun 13 02:09:31 UTC 2020
Thanks for the explanation.
Reviewed-by: Philip Yang <Philip.Yang at amd.com>
On 2020-06-12 7:43 p.m., Felix Kuehling wrote:
> Am 2020-06-12 um 6:03 p.m. schrieb philip yang:
>> It's a good idea; it would be better to add the same print in the system
>> memory eviction path, amdgpu_amdkfd_evict_userptr.
> That's covered by the message in kgd2kfd_quiesce_mm.
>
>
>> Use WARN_ONCE to avoid duplicate messages.
> I want duplicate messages. If many different kinds of evictions are
> happening I want to see them all. The module parameter is there so I can
> turn it on/off for short bursts while interesting things are happening.
> It's off by default.
>
> I was considering WARN_RATELIMIT, but that may skip interesting
> evictions I actually want to see.
>
> Regards,
> Felix
>
>
>> Regards,
>>
>> Philip
>>
>>
>> On 2020-06-11 11:34 p.m., Felix Kuehling wrote:
>>> Use WARN to print messages with backtrace when evictions are triggered.
>>> This can help determine the root cause of evictions and help spot driver
>>> bugs triggering evictions unintentionally, or help with performance
>>> tuning by avoiding conditions that cause evictions in a specific workload.
>>>
>>> The messages are controlled by a new module parameter that can be
>>> changed at runtime:
>>>
>>> echo Y > /sys/module/amdgpu/parameters/debug_evictions
>>> echo N > /sys/module/amdgpu/parameters/debug_evictions
>>>
>>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 ++++++++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 2 ++
>>> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 3 +++
>>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +++++
>>> 5 files changed, 20 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 10ae92e835f6..6c7dd0a707c9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -186,8 +186,10 @@ extern int amdgpu_noretry;
>>> extern int amdgpu_force_asic_type;
>>> #ifdef CONFIG_HSA_AMD
>>> extern int sched_policy;
>>> +extern bool debug_evictions;
>>> #else
>>> static const int sched_policy = KFD_SCHED_POLICY_HWS;
>>> +static const bool debug_evictions; /* = false */
>>> #endif
>>> extern int amdgpu_tmz;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index d4d7cca1cc72..fdf350d5e7b7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -705,6 +705,14 @@ MODULE_PARM_DESC(hws_gws_support, "Assume MEC2
>>> FW supports GWS barriers (false =
>>> int queue_preemption_timeout_ms = 9000;
>>> module_param(queue_preemption_timeout_ms, int, 0644);
>>> MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption
>>> timeout in ms (1 = Minimum, 9000 = default)");
>>> +
>>> +/**
>>> + * DOC: debug_evictions(bool)
>>> + * Enable extra debug messages to help determine the cause of evictions
>>> + */
>>> +bool debug_evictions;
>>> +module_param(debug_evictions, bool, 0644);
>>> +MODULE_PARM_DESC(debug_evictions, "enable eviction debug messages
>>> (false = default)");
>>> #endif
>>> /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>> index b87ca171986a..072f0e1185a8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>> @@ -275,6 +275,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
>>> struct amdgpu_sync *sync,
>>> continue;
>>> }
>>> + WARN(debug_evictions && fence_owner ==
>>> AMDGPU_FENCE_OWNER_KFD,
>>> + "Adding eviction fence to sync obj");
>>> r = amdgpu_sync_fence(sync, f, false);
>>> if (r)
>>> break;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index 22348cebaf36..80393e0583bb 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -942,6 +942,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
>>> if (!p)
>>> return -ESRCH;
>>> + WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
>>> r = kfd_process_evict_queues(p);
>>> kfd_unref_process(p);
>>> @@ -1009,6 +1010,8 @@ int
>>> kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>>> /* During process initialization eviction_work.dwork is
>>> initialized
>>> * to kfd_evict_bo_worker
>>> */
>>> + WARN(debug_evictions, "Scheduling eviction of pid %d in %ld
>>> jiffies",
>>> + p->lead_thread->pid, delay_jiffies);
>>> schedule_delayed_work(&p->eviction_work, delay_jiffies);
>>> out:
>>> kfd_unref_process(p);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 173d58b2d81f..51ba2020732e 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -177,6 +177,11 @@ extern bool hws_gws_support;
>>> */
>>> extern int queue_preemption_timeout_ms;
>>> +/*
>>> + * Enable eviction debug messages
>>> + */
>>> +extern bool debug_evictions;
>>> +
>>> enum cache_policy {
>>> cache_policy_coherent,
>>> cache_policy_noncoherent
More information about the amd-gfx
mailing list